Diffstat (limited to 'ipc')
-rw-r--r--  ipc/Makefile  12
-rw-r--r--  ipc/compat.c  757
-rw-r--r--  ipc/compat_mq.c  138
-rw-r--r--  ipc/ipc_sysctl.c  224
-rw-r--r--  ipc/kdbus/Makefile  33
-rw-r--r--  ipc/kdbus/bus.c  514
-rw-r--r--  ipc/kdbus/bus.h  99
-rw-r--r--  ipc/kdbus/connection.c  2207
-rw-r--r--  ipc/kdbus/connection.h  261
-rw-r--r--  ipc/kdbus/domain.c  296
-rw-r--r--  ipc/kdbus/domain.h  77
-rw-r--r--  ipc/kdbus/endpoint.c  275
-rw-r--r--  ipc/kdbus/endpoint.h  67
-rw-r--r--  ipc/kdbus/fs.c  508
-rw-r--r--  ipc/kdbus/fs.h  28
-rw-r--r--  ipc/kdbus/handle.c  709
-rw-r--r--  ipc/kdbus/handle.h  103
-rw-r--r--  ipc/kdbus/item.c  293
-rw-r--r--  ipc/kdbus/item.h  61
-rw-r--r--  ipc/kdbus/limits.h  61
-rw-r--r--  ipc/kdbus/main.c  114
-rw-r--r--  ipc/kdbus/match.c  546
-rw-r--r--  ipc/kdbus/match.h  35
-rw-r--r--  ipc/kdbus/message.c  1040
-rw-r--r--  ipc/kdbus/message.h  120
-rw-r--r--  ipc/kdbus/metadata.c  1342
-rw-r--r--  ipc/kdbus/metadata.h  86
-rw-r--r--  ipc/kdbus/names.c  770
-rw-r--r--  ipc/kdbus/names.h  74
-rw-r--r--  ipc/kdbus/node.c  897
-rw-r--r--  ipc/kdbus/node.h  86
-rw-r--r--  ipc/kdbus/notify.c  204
-rw-r--r--  ipc/kdbus/notify.h  30
-rw-r--r--  ipc/kdbus/policy.c  489
-rw-r--r--  ipc/kdbus/policy.h  51
-rw-r--r--  ipc/kdbus/pool.c  728
-rw-r--r--  ipc/kdbus/pool.h  46
-rw-r--r--  ipc/kdbus/queue.c  363
-rw-r--r--  ipc/kdbus/queue.h  84
-rw-r--r--  ipc/kdbus/reply.c  252
-rw-r--r--  ipc/kdbus/reply.h  68
-rw-r--r--  ipc/kdbus/util.c  156
-rw-r--r--  ipc/kdbus/util.h  73
-rw-r--r--  ipc/mq_sysctl.c  124
-rw-r--r--  ipc/mqueue.c  1462
-rw-r--r--  ipc/msg.c  1046
-rw-r--r--  ipc/msgutil.c  185
-rw-r--r--  ipc/namespace.c  175
-rw-r--r--  ipc/sem.c  2188
-rw-r--r--  ipc/shm.c  1368
-rw-r--r--  ipc/syscall.c  99
-rw-r--r--  ipc/util.c  883
-rw-r--r--  ipc/util.h  207
53 files changed, 22114 insertions, 0 deletions
diff --git a/ipc/Makefile b/ipc/Makefile
new file mode 100644
index 000000000..68ec4167d
--- /dev/null
+++ b/ipc/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the linux ipc.
+#
+
+obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
+obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o syscall.o
+obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o
+obj_mq-$(CONFIG_COMPAT) += compat_mq.o
+obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
+obj-$(CONFIG_IPC_NS) += namespace.o
+obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o
+obj-$(CONFIG_KDBUS) += kdbus/
diff --git a/ipc/compat.c b/ipc/compat.c
new file mode 100644
index 000000000..9b3c85f8a
--- /dev/null
+++ b/ipc/compat.c
@@ -0,0 +1,757 @@
+/*
+ * 32 bit compatibility code for System V IPC
+ *
+ * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
+ * Copyright (C) 2000 VA Linux Co
+ * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
+ * Copyright (C) 2000 Hewlett-Packard Co.
+ * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2000 Gerhard Tonn (ton@de.ibm.com)
+ * Copyright (C) 2000-2002 Andi Kleen, SuSE Labs (x86-64 port)
+ * Copyright (C) 2000 Silicon Graphics, Inc.
+ * Copyright (C) 2001 IBM
+ * Copyright (C) 2004 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ * Copyright (C) 2004 Arnd Bergmann (arnd@arndb.de)
+ *
+ * This code is collected from the versions for sparc64, mips64, s390x, ia64,
+ * ppc64 and x86_64, all of which are based on the original sparc64 version
+ * by Jakub Jelinek.
+ *
+ */
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/highuid.h>
+#include <linux/init.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/syscalls.h>
+#include <linux/ptrace.h>
+
+#include <linux/mutex.h>
+#include <linux/uaccess.h>
+
+#include "util.h"
+
+struct compat_msgbuf {
+ compat_long_t mtype;
+ char mtext[1];
+};
+
+struct compat_ipc_perm {
+ key_t key;
+ __compat_uid_t uid;
+ __compat_gid_t gid;
+ __compat_uid_t cuid;
+ __compat_gid_t cgid;
+ compat_mode_t mode;
+ unsigned short seq;
+};
+
+struct compat_semid_ds {
+ struct compat_ipc_perm sem_perm;
+ compat_time_t sem_otime;
+ compat_time_t sem_ctime;
+ compat_uptr_t sem_base;
+ compat_uptr_t sem_pending;
+ compat_uptr_t sem_pending_last;
+ compat_uptr_t undo;
+ unsigned short sem_nsems;
+};
+
+struct compat_msqid_ds {
+ struct compat_ipc_perm msg_perm;
+ compat_uptr_t msg_first;
+ compat_uptr_t msg_last;
+ compat_time_t msg_stime;
+ compat_time_t msg_rtime;
+ compat_time_t msg_ctime;
+ compat_ulong_t msg_lcbytes;
+ compat_ulong_t msg_lqbytes;
+ unsigned short msg_cbytes;
+ unsigned short msg_qnum;
+ unsigned short msg_qbytes;
+ compat_ipc_pid_t msg_lspid;
+ compat_ipc_pid_t msg_lrpid;
+};
+
+struct compat_shmid_ds {
+ struct compat_ipc_perm shm_perm;
+ int shm_segsz;
+ compat_time_t shm_atime;
+ compat_time_t shm_dtime;
+ compat_time_t shm_ctime;
+ compat_ipc_pid_t shm_cpid;
+ compat_ipc_pid_t shm_lpid;
+ unsigned short shm_nattch;
+ unsigned short shm_unused;
+ compat_uptr_t shm_unused2;
+ compat_uptr_t shm_unused3;
+};
+
+struct compat_ipc_kludge {
+ compat_uptr_t msgp;
+ compat_long_t msgtyp;
+};
+
+struct compat_shminfo64 {
+ compat_ulong_t shmmax;
+ compat_ulong_t shmmin;
+ compat_ulong_t shmmni;
+ compat_ulong_t shmseg;
+ compat_ulong_t shmall;
+ compat_ulong_t __unused1;
+ compat_ulong_t __unused2;
+ compat_ulong_t __unused3;
+ compat_ulong_t __unused4;
+};
+
+struct compat_shm_info {
+ compat_int_t used_ids;
+ compat_ulong_t shm_tot, shm_rss, shm_swp;
+ compat_ulong_t swap_attempts, swap_successes;
+};
+
+static inline int compat_ipc_parse_version(int *cmd)
+{
+#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION
+ int version = *cmd & IPC_64;
+
+ /* this is tricky: architectures that have support for the old
+ * ipc structures in 64 bit binaries need to have IPC_64 set
+ * in cmd, the others need to have it cleared */
+#ifndef ipc_parse_version
+ *cmd |= IPC_64;
+#else
+ *cmd &= ~IPC_64;
+#endif
+ return version;
+#else
+ /* With the asm-generic APIs, we always use the 64-bit versions. */
+ return IPC_64;
+#endif
+}
+
+static inline int __get_compat_ipc64_perm(struct ipc64_perm *p64,
+ struct compat_ipc64_perm __user *up64)
+{
+ int err;
+
+ err = __get_user(p64->uid, &up64->uid);
+ err |= __get_user(p64->gid, &up64->gid);
+ err |= __get_user(p64->mode, &up64->mode);
+ return err;
+}
+
+static inline int __get_compat_ipc_perm(struct ipc64_perm *p,
+ struct compat_ipc_perm __user *up)
+{
+ int err;
+
+ err = __get_user(p->uid, &up->uid);
+ err |= __get_user(p->gid, &up->gid);
+ err |= __get_user(p->mode, &up->mode);
+ return err;
+}
+
+static inline int __put_compat_ipc64_perm(struct ipc64_perm *p64,
+ struct compat_ipc64_perm __user *up64)
+{
+ int err;
+
+ err = __put_user(p64->key, &up64->key);
+ err |= __put_user(p64->uid, &up64->uid);
+ err |= __put_user(p64->gid, &up64->gid);
+ err |= __put_user(p64->cuid, &up64->cuid);
+ err |= __put_user(p64->cgid, &up64->cgid);
+ err |= __put_user(p64->mode, &up64->mode);
+ err |= __put_user(p64->seq, &up64->seq);
+ return err;
+}
+
+static inline int __put_compat_ipc_perm(struct ipc64_perm *p,
+ struct compat_ipc_perm __user *uip)
+{
+ int err;
+ __compat_uid_t u;
+ __compat_gid_t g;
+
+ err = __put_user(p->key, &uip->key);
+ SET_UID(u, p->uid);
+ err |= __put_user(u, &uip->uid);
+ SET_GID(g, p->gid);
+ err |= __put_user(g, &uip->gid);
+ SET_UID(u, p->cuid);
+ err |= __put_user(u, &uip->cuid);
+ SET_GID(g, p->cgid);
+ err |= __put_user(g, &uip->cgid);
+ err |= __put_user(p->mode, &uip->mode);
+ err |= __put_user(p->seq, &uip->seq);
+ return err;
+}
+
+static inline int get_compat_semid64_ds(struct semid64_ds *sem64,
+ struct compat_semid64_ds __user *up64)
+{
+ if (!access_ok(VERIFY_READ, up64, sizeof(*up64)))
+ return -EFAULT;
+ return __get_compat_ipc64_perm(&sem64->sem_perm, &up64->sem_perm);
+}
+
+static inline int get_compat_semid_ds(struct semid64_ds *s,
+ struct compat_semid_ds __user *up)
+{
+ if (!access_ok(VERIFY_READ, up, sizeof(*up)))
+ return -EFAULT;
+ return __get_compat_ipc_perm(&s->sem_perm, &up->sem_perm);
+}
+
+static inline int put_compat_semid64_ds(struct semid64_ds *sem64,
+ struct compat_semid64_ds __user *up64)
+{
+ int err;
+
+ if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
+ return -EFAULT;
+ err = __put_compat_ipc64_perm(&sem64->sem_perm, &up64->sem_perm);
+ err |= __put_user(sem64->sem_otime, &up64->sem_otime);
+ err |= __put_user(sem64->sem_ctime, &up64->sem_ctime);
+ err |= __put_user(sem64->sem_nsems, &up64->sem_nsems);
+ return err;
+}
+
+static inline int put_compat_semid_ds(struct semid64_ds *s,
+ struct compat_semid_ds __user *up)
+{
+ int err;
+
+ if (!access_ok(VERIFY_WRITE, up, sizeof(*up)))
+ return -EFAULT;
+ err = __put_compat_ipc_perm(&s->sem_perm, &up->sem_perm);
+ err |= __put_user(s->sem_otime, &up->sem_otime);
+ err |= __put_user(s->sem_ctime, &up->sem_ctime);
+ err |= __put_user(s->sem_nsems, &up->sem_nsems);
+ return err;
+}
+
+static long do_compat_semctl(int first, int second, int third, u32 pad)
+{
+ unsigned long fourth;
+ int err, err2;
+ struct semid64_ds sem64;
+ struct semid64_ds __user *up64;
+ int version = compat_ipc_parse_version(&third);
+
+ memset(&sem64, 0, sizeof(sem64));
+
+ if ((third & (~IPC_64)) == SETVAL)
+#ifdef __BIG_ENDIAN
+ fourth = (unsigned long)pad << 32;
+#else
+ fourth = pad;
+#endif
+ else
+ fourth = (unsigned long)compat_ptr(pad);
+ switch (third & (~IPC_64)) {
+ case IPC_INFO:
+ case IPC_RMID:
+ case SEM_INFO:
+ case GETVAL:
+ case GETPID:
+ case GETNCNT:
+ case GETZCNT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ err = sys_semctl(first, second, third, fourth);
+ break;
+
+ case IPC_STAT:
+ case SEM_STAT:
+ up64 = compat_alloc_user_space(sizeof(sem64));
+ fourth = (unsigned long)up64;
+ err = sys_semctl(first, second, third, fourth);
+ if (err < 0)
+ break;
+ if (copy_from_user(&sem64, up64, sizeof(sem64)))
+ err2 = -EFAULT;
+ else if (version == IPC_64)
+ err2 = put_compat_semid64_ds(&sem64, compat_ptr(pad));
+ else
+ err2 = put_compat_semid_ds(&sem64, compat_ptr(pad));
+ if (err2)
+ err = -EFAULT;
+ break;
+
+ case IPC_SET:
+ if (version == IPC_64)
+ err = get_compat_semid64_ds(&sem64, compat_ptr(pad));
+ else
+ err = get_compat_semid_ds(&sem64, compat_ptr(pad));
+
+ up64 = compat_alloc_user_space(sizeof(sem64));
+ if (copy_to_user(up64, &sem64, sizeof(sem64)))
+ err = -EFAULT;
+ if (err)
+ break;
+
+ fourth = (unsigned long)up64;
+ err = sys_semctl(first, second, third, fourth);
+ break;
+
+ default:
+ err = -EINVAL;
+ break;
+ }
+ return err;
+}
+
+static long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz)
+{
+ struct compat_msgbuf __user *msgp = dest;
+ size_t msgsz;
+
+ if (put_user(msg->m_type, &msgp->mtype))
+ return -EFAULT;
+
+ msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz;
+ if (store_msg(msgp->mtext, msg, msgsz))
+ return -EFAULT;
+ return msgsz;
+}
+
+#ifndef COMPAT_SHMLBA
+#define COMPAT_SHMLBA SHMLBA
+#endif
+
+#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC
+COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second,
+ u32, third, compat_uptr_t, ptr, u32, fifth)
+{
+ int version;
+ u32 pad;
+
+ version = call >> 16; /* hack for backward compatibility */
+ call &= 0xffff;
+
+ switch (call) {
+ case SEMOP:
+ /* struct sembuf is the same on 32 and 64bit :)) */
+ return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
+ case SEMTIMEDOP:
+ return compat_sys_semtimedop(first, compat_ptr(ptr), second,
+ compat_ptr(fifth));
+ case SEMGET:
+ return sys_semget(first, second, third);
+ case SEMCTL:
+ if (!ptr)
+ return -EINVAL;
+ if (get_user(pad, (u32 __user *) compat_ptr(ptr)))
+ return -EFAULT;
+ return do_compat_semctl(first, second, third, pad);
+
+ case MSGSND: {
+ struct compat_msgbuf __user *up = compat_ptr(ptr);
+ compat_long_t type;
+
+ if (first < 0 || second < 0)
+ return -EINVAL;
+
+ if (get_user(type, &up->mtype))
+ return -EFAULT;
+
+ return do_msgsnd(first, type, up->mtext, second, third);
+ }
+ case MSGRCV: {
+ void __user *uptr = compat_ptr(ptr);
+
+ if (first < 0 || second < 0)
+ return -EINVAL;
+
+ if (!version) {
+ struct compat_ipc_kludge ipck;
+ if (!uptr)
+ return -EINVAL;
+ if (copy_from_user(&ipck, uptr, sizeof(ipck)))
+ return -EFAULT;
+ uptr = compat_ptr(ipck.msgp);
+ fifth = ipck.msgtyp;
+ }
+ return do_msgrcv(first, uptr, second, (s32)fifth, third,
+ compat_do_msg_fill);
+ }
+ case MSGGET:
+ return sys_msgget(first, second);
+ case MSGCTL:
+ return compat_sys_msgctl(first, second, compat_ptr(ptr));
+
+ case SHMAT: {
+ int err;
+ unsigned long raddr;
+
+ if (version == 1)
+ return -EINVAL;
+ err = do_shmat(first, compat_ptr(ptr), second, &raddr,
+ COMPAT_SHMLBA);
+ if (err < 0)
+ return err;
+ return put_user(raddr, (compat_ulong_t *)compat_ptr(third));
+ }
+ case SHMDT:
+ return sys_shmdt(compat_ptr(ptr));
+ case SHMGET:
+ return sys_shmget(first, (unsigned)second, third);
+ case SHMCTL:
+ return compat_sys_shmctl(first, second, compat_ptr(ptr));
+ }
+
+ return -ENOSYS;
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg)
+{
+ return do_compat_semctl(semid, semnum, cmd, arg);
+}
+
+COMPAT_SYSCALL_DEFINE4(msgsnd, int, msqid, compat_uptr_t, msgp,
+ compat_ssize_t, msgsz, int, msgflg)
+{
+ struct compat_msgbuf __user *up = compat_ptr(msgp);
+ compat_long_t mtype;
+
+ if (get_user(mtype, &up->mtype))
+ return -EFAULT;
+ return do_msgsnd(msqid, mtype, up->mtext, (ssize_t)msgsz, msgflg);
+}
+
+COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp,
+ compat_ssize_t, msgsz, compat_long_t, msgtyp, int, msgflg)
+{
+ return do_msgrcv(msqid, compat_ptr(msgp), (ssize_t)msgsz, (long)msgtyp,
+ msgflg, compat_do_msg_fill);
+}
+
+static inline int get_compat_msqid64(struct msqid64_ds *m64,
+ struct compat_msqid64_ds __user *up64)
+{
+ int err;
+
+ if (!access_ok(VERIFY_READ, up64, sizeof(*up64)))
+ return -EFAULT;
+ err = __get_compat_ipc64_perm(&m64->msg_perm, &up64->msg_perm);
+ err |= __get_user(m64->msg_qbytes, &up64->msg_qbytes);
+ return err;
+}
+
+static inline int get_compat_msqid(struct msqid64_ds *m,
+ struct compat_msqid_ds __user *up)
+{
+ int err;
+
+ if (!access_ok(VERIFY_READ, up, sizeof(*up)))
+ return -EFAULT;
+ err = __get_compat_ipc_perm(&m->msg_perm, &up->msg_perm);
+ err |= __get_user(m->msg_qbytes, &up->msg_qbytes);
+ return err;
+}
+
+static inline int put_compat_msqid64_ds(struct msqid64_ds *m64,
+ struct compat_msqid64_ds __user *up64)
+{
+ int err;
+
+ if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
+ return -EFAULT;
+ err = __put_compat_ipc64_perm(&m64->msg_perm, &up64->msg_perm);
+ err |= __put_user(m64->msg_stime, &up64->msg_stime);
+ err |= __put_user(m64->msg_rtime, &up64->msg_rtime);
+ err |= __put_user(m64->msg_ctime, &up64->msg_ctime);
+ err |= __put_user(m64->msg_cbytes, &up64->msg_cbytes);
+ err |= __put_user(m64->msg_qnum, &up64->msg_qnum);
+ err |= __put_user(m64->msg_qbytes, &up64->msg_qbytes);
+ err |= __put_user(m64->msg_lspid, &up64->msg_lspid);
+ err |= __put_user(m64->msg_lrpid, &up64->msg_lrpid);
+ return err;
+}
+
+static inline int put_compat_msqid_ds(struct msqid64_ds *m,
+ struct compat_msqid_ds __user *up)
+{
+ int err;
+
+ if (!access_ok(VERIFY_WRITE, up, sizeof(*up)))
+ return -EFAULT;
+ err = __put_compat_ipc_perm(&m->msg_perm, &up->msg_perm);
+ err |= __put_user(m->msg_stime, &up->msg_stime);
+ err |= __put_user(m->msg_rtime, &up->msg_rtime);
+ err |= __put_user(m->msg_ctime, &up->msg_ctime);
+ err |= __put_user(m->msg_cbytes, &up->msg_cbytes);
+ err |= __put_user(m->msg_qnum, &up->msg_qnum);
+ err |= __put_user(m->msg_qbytes, &up->msg_qbytes);
+ err |= __put_user(m->msg_lspid, &up->msg_lspid);
+ err |= __put_user(m->msg_lrpid, &up->msg_lrpid);
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE3(msgctl, int, first, int, second, void __user *, uptr)
+{
+ int err, err2;
+ struct msqid64_ds m64;
+ int version = compat_ipc_parse_version(&second);
+ void __user *p;
+
+ memset(&m64, 0, sizeof(m64));
+
+ switch (second & (~IPC_64)) {
+ case IPC_INFO:
+ case IPC_RMID:
+ case MSG_INFO:
+ err = sys_msgctl(first, second, uptr);
+ break;
+
+ case IPC_SET:
+ if (version == IPC_64)
+ err = get_compat_msqid64(&m64, uptr);
+ else
+ err = get_compat_msqid(&m64, uptr);
+
+ if (err)
+ break;
+ p = compat_alloc_user_space(sizeof(m64));
+ if (copy_to_user(p, &m64, sizeof(m64)))
+ err = -EFAULT;
+ else
+ err = sys_msgctl(first, second, p);
+ break;
+
+ case IPC_STAT:
+ case MSG_STAT:
+ p = compat_alloc_user_space(sizeof(m64));
+ err = sys_msgctl(first, second, p);
+ if (err < 0)
+ break;
+ if (copy_from_user(&m64, p, sizeof(m64)))
+ err2 = -EFAULT;
+ else if (version == IPC_64)
+ err2 = put_compat_msqid64_ds(&m64, uptr);
+ else
+ err2 = put_compat_msqid_ds(&m64, uptr);
+ if (err2)
+ err = -EFAULT;
+ break;
+
+ default:
+ err = -EINVAL;
+ break;
+ }
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE3(shmat, int, shmid, compat_uptr_t, shmaddr, int, shmflg)
+{
+ unsigned long ret;
+ long err;
+
+ err = do_shmat(shmid, compat_ptr(shmaddr), shmflg, &ret, COMPAT_SHMLBA);
+ if (err)
+ return err;
+ force_successful_syscall_return();
+ return (long)ret;
+}
+
+static inline int get_compat_shmid64_ds(struct shmid64_ds *sem64,
+ struct compat_shmid64_ds __user *up64)
+{
+ if (!access_ok(VERIFY_READ, up64, sizeof(*up64)))
+ return -EFAULT;
+ return __get_compat_ipc64_perm(&sem64->shm_perm, &up64->shm_perm);
+}
+
+static inline int get_compat_shmid_ds(struct shmid64_ds *s,
+ struct compat_shmid_ds __user *up)
+{
+ if (!access_ok(VERIFY_READ, up, sizeof(*up)))
+ return -EFAULT;
+ return __get_compat_ipc_perm(&s->shm_perm, &up->shm_perm);
+}
+
+static inline int put_compat_shmid64_ds(struct shmid64_ds *sem64,
+ struct compat_shmid64_ds __user *up64)
+{
+ int err;
+
+ if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
+ return -EFAULT;
+ err = __put_compat_ipc64_perm(&sem64->shm_perm, &up64->shm_perm);
+ err |= __put_user(sem64->shm_atime, &up64->shm_atime);
+ err |= __put_user(sem64->shm_dtime, &up64->shm_dtime);
+ err |= __put_user(sem64->shm_ctime, &up64->shm_ctime);
+ err |= __put_user(sem64->shm_segsz, &up64->shm_segsz);
+ err |= __put_user(sem64->shm_nattch, &up64->shm_nattch);
+ err |= __put_user(sem64->shm_cpid, &up64->shm_cpid);
+ err |= __put_user(sem64->shm_lpid, &up64->shm_lpid);
+ return err;
+}
+
+static inline int put_compat_shmid_ds(struct shmid64_ds *s,
+ struct compat_shmid_ds __user *up)
+{
+ int err;
+
+ if (!access_ok(VERIFY_WRITE, up, sizeof(*up)))
+ return -EFAULT;
+ err = __put_compat_ipc_perm(&s->shm_perm, &up->shm_perm);
+ err |= __put_user(s->shm_atime, &up->shm_atime);
+ err |= __put_user(s->shm_dtime, &up->shm_dtime);
+ err |= __put_user(s->shm_ctime, &up->shm_ctime);
+ err |= __put_user(s->shm_segsz, &up->shm_segsz);
+ err |= __put_user(s->shm_nattch, &up->shm_nattch);
+ err |= __put_user(s->shm_cpid, &up->shm_cpid);
+ err |= __put_user(s->shm_lpid, &up->shm_lpid);
+ return err;
+}
+
+static inline int put_compat_shminfo64(struct shminfo64 *smi,
+ struct compat_shminfo64 __user *up64)
+{
+ int err;
+
+ if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
+ return -EFAULT;
+ if (smi->shmmax > INT_MAX)
+ smi->shmmax = INT_MAX;
+ err = __put_user(smi->shmmax, &up64->shmmax);
+ err |= __put_user(smi->shmmin, &up64->shmmin);
+ err |= __put_user(smi->shmmni, &up64->shmmni);
+ err |= __put_user(smi->shmseg, &up64->shmseg);
+ err |= __put_user(smi->shmall, &up64->shmall);
+ return err;
+}
+
+static inline int put_compat_shminfo(struct shminfo64 *smi,
+ struct shminfo __user *up)
+{
+ int err;
+
+ if (!access_ok(VERIFY_WRITE, up, sizeof(*up)))
+ return -EFAULT;
+ if (smi->shmmax > INT_MAX)
+ smi->shmmax = INT_MAX;
+ err = __put_user(smi->shmmax, &up->shmmax);
+ err |= __put_user(smi->shmmin, &up->shmmin);
+ err |= __put_user(smi->shmmni, &up->shmmni);
+ err |= __put_user(smi->shmseg, &up->shmseg);
+ err |= __put_user(smi->shmall, &up->shmall);
+ return err;
+}
+
+static inline int put_compat_shm_info(struct shm_info __user *ip,
+ struct compat_shm_info __user *uip)
+{
+ int err;
+ struct shm_info si;
+
+ if (!access_ok(VERIFY_WRITE, uip, sizeof(*uip)) ||
+ copy_from_user(&si, ip, sizeof(si)))
+ return -EFAULT;
+ err = __put_user(si.used_ids, &uip->used_ids);
+ err |= __put_user(si.shm_tot, &uip->shm_tot);
+ err |= __put_user(si.shm_rss, &uip->shm_rss);
+ err |= __put_user(si.shm_swp, &uip->shm_swp);
+ err |= __put_user(si.swap_attempts, &uip->swap_attempts);
+ err |= __put_user(si.swap_successes, &uip->swap_successes);
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE3(shmctl, int, first, int, second, void __user *, uptr)
+{
+ void __user *p;
+ struct shmid64_ds sem64;
+ struct shminfo64 smi;
+ int err, err2;
+ int version = compat_ipc_parse_version(&second);
+
+ memset(&sem64, 0, sizeof(sem64));
+
+ switch (second & (~IPC_64)) {
+ case IPC_RMID:
+ case SHM_LOCK:
+ case SHM_UNLOCK:
+ err = sys_shmctl(first, second, uptr);
+ break;
+
+ case IPC_INFO:
+ p = compat_alloc_user_space(sizeof(smi));
+ err = sys_shmctl(first, second, p);
+ if (err < 0)
+ break;
+ if (copy_from_user(&smi, p, sizeof(smi)))
+ err2 = -EFAULT;
+ else if (version == IPC_64)
+ err2 = put_compat_shminfo64(&smi, uptr);
+ else
+ err2 = put_compat_shminfo(&smi, uptr);
+ if (err2)
+ err = -EFAULT;
+ break;
+
+
+ case IPC_SET:
+ if (version == IPC_64)
+ err = get_compat_shmid64_ds(&sem64, uptr);
+ else
+ err = get_compat_shmid_ds(&sem64, uptr);
+
+ if (err)
+ break;
+ p = compat_alloc_user_space(sizeof(sem64));
+ if (copy_to_user(p, &sem64, sizeof(sem64)))
+ err = -EFAULT;
+ else
+ err = sys_shmctl(first, second, p);
+ break;
+
+ case IPC_STAT:
+ case SHM_STAT:
+ p = compat_alloc_user_space(sizeof(sem64));
+ err = sys_shmctl(first, second, p);
+ if (err < 0)
+ break;
+ if (copy_from_user(&sem64, p, sizeof(sem64)))
+ err2 = -EFAULT;
+ else if (version == IPC_64)
+ err2 = put_compat_shmid64_ds(&sem64, uptr);
+ else
+ err2 = put_compat_shmid_ds(&sem64, uptr);
+ if (err2)
+ err = -EFAULT;
+ break;
+
+ case SHM_INFO:
+ p = compat_alloc_user_space(sizeof(struct shm_info));
+ err = sys_shmctl(first, second, p);
+ if (err < 0)
+ break;
+ err2 = put_compat_shm_info(p, uptr);
+ if (err2)
+ err = -EFAULT;
+ break;
+
+ default:
+ err = -EINVAL;
+ break;
+ }
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsems,
+ unsigned, nsops,
+ const struct compat_timespec __user *, timeout)
+{
+ struct timespec __user *ts64;
+ if (compat_convert_timespec(&ts64, timeout))
+ return -EFAULT;
+ return sys_semtimedop(semid, tsems, nsops, ts64);
+}
diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c
new file mode 100644
index 000000000..ef6f91cc4
--- /dev/null
+++ b/ipc/compat_mq.c
@@ -0,0 +1,138 @@
+/*
+ * ipc/compat_mq.c
+ * 32 bit emulation for POSIX message queue system calls
+ *
+ * Copyright (C) 2004 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ * Author: Arnd Bergmann <arnd@arndb.de>
+ */
+
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mqueue.h>
+#include <linux/syscalls.h>
+
+#include <linux/uaccess.h>
+
+struct compat_mq_attr {
+ compat_long_t mq_flags; /* message queue flags */
+ compat_long_t mq_maxmsg; /* maximum number of messages */
+ compat_long_t mq_msgsize; /* maximum message size */
+ compat_long_t mq_curmsgs; /* number of messages currently queued */
+ compat_long_t __reserved[4]; /* ignored for input, zeroed for output */
+};
+
+static inline int get_compat_mq_attr(struct mq_attr *attr,
+ const struct compat_mq_attr __user *uattr)
+{
+ if (!access_ok(VERIFY_READ, uattr, sizeof *uattr))
+ return -EFAULT;
+
+ return __get_user(attr->mq_flags, &uattr->mq_flags)
+ | __get_user(attr->mq_maxmsg, &uattr->mq_maxmsg)
+ | __get_user(attr->mq_msgsize, &uattr->mq_msgsize)
+ | __get_user(attr->mq_curmsgs, &uattr->mq_curmsgs);
+}
+
+static inline int put_compat_mq_attr(const struct mq_attr *attr,
+ struct compat_mq_attr __user *uattr)
+{
+ if (clear_user(uattr, sizeof *uattr))
+ return -EFAULT;
+
+ return __put_user(attr->mq_flags, &uattr->mq_flags)
+ | __put_user(attr->mq_maxmsg, &uattr->mq_maxmsg)
+ | __put_user(attr->mq_msgsize, &uattr->mq_msgsize)
+ | __put_user(attr->mq_curmsgs, &uattr->mq_curmsgs);
+}
+
+COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name,
+ int, oflag, compat_mode_t, mode,
+ struct compat_mq_attr __user *, u_attr)
+{
+ void __user *p = NULL;
+ if (u_attr && oflag & O_CREAT) {
+ struct mq_attr attr;
+
+ memset(&attr, 0, sizeof(attr));
+
+ p = compat_alloc_user_space(sizeof(attr));
+ if (get_compat_mq_attr(&attr, u_attr) ||
+ copy_to_user(p, &attr, sizeof(attr)))
+ return -EFAULT;
+ }
+ return sys_mq_open(u_name, oflag, mode, p);
+}
+
+COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes,
+ const char __user *, u_msg_ptr,
+ compat_size_t, msg_len, unsigned int, msg_prio,
+ const struct compat_timespec __user *, u_abs_timeout)
+{
+ struct timespec __user *u_ts;
+
+ if (compat_convert_timespec(&u_ts, u_abs_timeout))
+ return -EFAULT;
+
+ return sys_mq_timedsend(mqdes, u_msg_ptr, msg_len,
+ msg_prio, u_ts);
+}
+
+COMPAT_SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes,
+ char __user *, u_msg_ptr,
+ compat_size_t, msg_len, unsigned int __user *, u_msg_prio,
+ const struct compat_timespec __user *, u_abs_timeout)
+{
+ struct timespec __user *u_ts;
+
+ if (compat_convert_timespec(&u_ts, u_abs_timeout))
+ return -EFAULT;
+
+ return sys_mq_timedreceive(mqdes, u_msg_ptr, msg_len,
+ u_msg_prio, u_ts);
+}
+
+COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
+ const struct compat_sigevent __user *, u_notification)
+{
+ struct sigevent __user *p = NULL;
+ if (u_notification) {
+ struct sigevent n;
+ p = compat_alloc_user_space(sizeof(*p));
+ if (get_compat_sigevent(&n, u_notification))
+ return -EFAULT;
+ if (n.sigev_notify == SIGEV_THREAD)
+ n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int);
+ if (copy_to_user(p, &n, sizeof(*p)))
+ return -EFAULT;
+ }
+ return sys_mq_notify(mqdes, p);
+}
+
+COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
+ const struct compat_mq_attr __user *, u_mqstat,
+ struct compat_mq_attr __user *, u_omqstat)
+{
+ struct mq_attr mqstat;
+ struct mq_attr __user *p = compat_alloc_user_space(2 * sizeof(*p));
+ long ret;
+
+ memset(&mqstat, 0, sizeof(mqstat));
+
+ if (u_mqstat) {
+ if (get_compat_mq_attr(&mqstat, u_mqstat) ||
+ copy_to_user(p, &mqstat, sizeof(mqstat)))
+ return -EFAULT;
+ }
+ ret = sys_mq_getsetattr(mqdes,
+ u_mqstat ? p : NULL,
+ u_omqstat ? p + 1 : NULL);
+ if (ret)
+ return ret;
+ if (u_omqstat) {
+ if (copy_from_user(&mqstat, p + 1, sizeof(mqstat)) ||
+ put_compat_mq_attr(&mqstat, u_omqstat))
+ return -EFAULT;
+ }
+ return 0;
+}
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
new file mode 100644
index 000000000..8ad93c29f
--- /dev/null
+++ b/ipc/ipc_sysctl.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (C) 2007
+ *
+ * Author: Eric Biederman <ebiederm@xmision.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/module.h>
+#include <linux/ipc.h>
+#include <linux/nsproxy.h>
+#include <linux/sysctl.h>
+#include <linux/uaccess.h>
+#include <linux/ipc_namespace.h>
+#include <linux/msg.h>
+#include "util.h"
+
+static void *get_ipc(struct ctl_table *table)
+{
+ char *which = table->data;
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+ which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
+ return which;
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+static int proc_ipc_dointvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table ipc_table;
+
+ memcpy(&ipc_table, table, sizeof(ipc_table));
+ ipc_table.data = get_ipc(table);
+
+ return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
+}
+
+static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table ipc_table;
+
+ memcpy(&ipc_table, table, sizeof(ipc_table));
+ ipc_table.data = get_ipc(table);
+
+ return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
+}
+
+static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+ int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (err < 0)
+ return err;
+ if (ns->shm_rmid_forced)
+ shm_destroy_orphaned(ns);
+ return err;
+}
+
+static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table ipc_table;
+ memcpy(&ipc_table, table, sizeof(ipc_table));
+ ipc_table.data = get_ipc(table);
+
+ return proc_doulongvec_minmax(&ipc_table, write, buffer,
+ lenp, ppos);
+}
+
+static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table ipc_table;
+ int dummy = 0;
+
+ memcpy(&ipc_table, table, sizeof(ipc_table));
+ ipc_table.data = &dummy;
+
+ if (write)
+ pr_info_once("writing to auto_msgmni has no effect");
+
+ return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
+}
+
+#else
+#define proc_ipc_doulongvec_minmax NULL
+#define proc_ipc_dointvec NULL
+#define proc_ipc_dointvec_minmax NULL
+#define proc_ipc_dointvec_minmax_orphans NULL
+#define proc_ipc_auto_msgmni NULL
+#endif
+
+static int zero;
+static int one = 1;
+static int int_max = INT_MAX;
+
+static struct ctl_table ipc_kern_table[] = {
+ {
+ .procname = "shmmax",
+ .data = &init_ipc_ns.shm_ctlmax,
+ .maxlen = sizeof(init_ipc_ns.shm_ctlmax),
+ .mode = 0644,
+ .proc_handler = proc_ipc_doulongvec_minmax,
+ },
+ {
+ .procname = "shmall",
+ .data = &init_ipc_ns.shm_ctlall,
+ .maxlen = sizeof(init_ipc_ns.shm_ctlall),
+ .mode = 0644,
+ .proc_handler = proc_ipc_doulongvec_minmax,
+ },
+ {
+ .procname = "shmmni",
+ .data = &init_ipc_ns.shm_ctlmni,
+ .maxlen = sizeof(init_ipc_ns.shm_ctlmni),
+ .mode = 0644,
+ .proc_handler = proc_ipc_dointvec,
+ },
+ {
+ .procname = "shm_rmid_forced",
+ .data = &init_ipc_ns.shm_rmid_forced,
+ .maxlen = sizeof(init_ipc_ns.shm_rmid_forced),
+ .mode = 0644,
+ .proc_handler = proc_ipc_dointvec_minmax_orphans,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "msgmax",
+ .data = &init_ipc_ns.msg_ctlmax,
+ .maxlen = sizeof(init_ipc_ns.msg_ctlmax),
+ .mode = 0644,
+ .proc_handler = proc_ipc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ },
+ {
+ .procname = "msgmni",
+ .data = &init_ipc_ns.msg_ctlmni,
+ .maxlen = sizeof(init_ipc_ns.msg_ctlmni),
+ .mode = 0644,
+ .proc_handler = proc_ipc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ },
+ {
+ .procname = "auto_msgmni",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_ipc_auto_msgmni,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "msgmnb",
+ .data = &init_ipc_ns.msg_ctlmnb,
+ .maxlen = sizeof(init_ipc_ns.msg_ctlmnb),
+ .mode = 0644,
+ .proc_handler = proc_ipc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ },
+ {
+ .procname = "sem",
+ .data = &init_ipc_ns.sem_ctls,
+ .maxlen = 4*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_ipc_dointvec,
+ },
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ {
+ .procname = "sem_next_id",
+ .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
+ .mode = 0644,
+ .proc_handler = proc_ipc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ },
+ {
+ .procname = "msg_next_id",
+ .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
+ .mode = 0644,
+ .proc_handler = proc_ipc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ },
+ {
+ .procname = "shm_next_id",
+ .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
+ .mode = 0644,
+ .proc_handler = proc_ipc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &int_max,
+ },
+#endif
+ {}
+};
+
+static struct ctl_table ipc_root_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = ipc_kern_table,
+ },
+ {}
+};
+
+static int __init ipc_sysctl_init(void)
+{
+ register_sysctl_table(ipc_root_table);
+ return 0;
+}
+
+device_initcall(ipc_sysctl_init);
diff --git a/ipc/kdbus/Makefile b/ipc/kdbus/Makefile
new file mode 100644
index 000000000..66663a124
--- /dev/null
+++ b/ipc/kdbus/Makefile
@@ -0,0 +1,33 @@
+#
+# By setting KDBUS_EXT=2, the kdbus module will be built as kdbus2.ko, and
+# KBUILD_MODNAME=kdbus2. This has the effect that all exported objects have
+# different names than usual (kdbus2fs, /sys/fs/kdbus2/) and you can run
+# your test-infrastructure against the kdbus2.ko, while running your system
+# on kdbus.ko.
+#
+# To just build the module, use:
+# make KDBUS_EXT=2 M=ipc/kdbus
+#
+
+kdbus$(KDBUS_EXT)-y := \
+ bus.o \
+ connection.o \
+ endpoint.o \
+ fs.o \
+ handle.o \
+ item.o \
+ main.o \
+ match.o \
+ message.o \
+ metadata.o \
+ names.o \
+ node.o \
+ notify.o \
+ domain.o \
+ policy.o \
+ pool.o \
+ reply.o \
+ queue.o \
+ util.o
+
+obj-$(CONFIG_KDBUS) += kdbus$(KDBUS_EXT).o
diff --git a/ipc/kdbus/bus.c b/ipc/kdbus/bus.c
new file mode 100644
index 000000000..a67f825bd
--- /dev/null
+++ b/ipc/kdbus/bus.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "notify.h"
+#include "connection.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "match.h"
+#include "message.h"
+#include "metadata.h"
+#include "names.h"
+#include "policy.h"
+#include "util.h"
+
+static void kdbus_bus_free(struct kdbus_node *node)
+{
+ struct kdbus_bus *bus = container_of(node, struct kdbus_bus, node);
+
+ WARN_ON(!list_empty(&bus->monitors_list));
+ WARN_ON(!hash_empty(bus->conn_hash));
+
+ kdbus_notify_free(bus);
+
+ kdbus_user_unref(bus->creator);
+ kdbus_name_registry_free(bus->name_registry);
+ kdbus_domain_unref(bus->domain);
+ kdbus_policy_db_clear(&bus->policy_db);
+ kdbus_meta_proc_unref(bus->creator_meta);
+ kfree(bus);
+}
+
+static void kdbus_bus_release(struct kdbus_node *node, bool was_active)
+{
+ struct kdbus_bus *bus = container_of(node, struct kdbus_bus, node);
+
+ if (was_active)
+ atomic_dec(&bus->creator->buses);
+}
+
+static struct kdbus_bus *kdbus_bus_new(struct kdbus_domain *domain,
+ const char *name,
+ struct kdbus_bloom_parameter *bloom,
+ const u64 *pattach_owner,
+ u64 flags, kuid_t uid, kgid_t gid)
+{
+ struct kdbus_bus *b;
+ u64 attach_owner;
+ int ret;
+
+ if (bloom->size < 8 || bloom->size > KDBUS_BUS_BLOOM_MAX_SIZE ||
+ !KDBUS_IS_ALIGNED8(bloom->size) || bloom->n_hash < 1)
+ return ERR_PTR(-EINVAL);
+
+ ret = kdbus_sanitize_attach_flags(pattach_owner ? *pattach_owner : 0,
+ &attach_owner);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ ret = kdbus_verify_uid_prefix(name, domain->user_namespace, uid);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ b = kzalloc(sizeof(*b), GFP_KERNEL);
+ if (!b)
+ return ERR_PTR(-ENOMEM);
+
+ kdbus_node_init(&b->node, KDBUS_NODE_BUS);
+
+ b->node.free_cb = kdbus_bus_free;
+ b->node.release_cb = kdbus_bus_release;
+ b->node.uid = uid;
+ b->node.gid = gid;
+ b->node.mode = S_IRUSR | S_IXUSR;
+
+ if (flags & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD))
+ b->node.mode |= S_IRGRP | S_IXGRP;
+ if (flags & KDBUS_MAKE_ACCESS_WORLD)
+ b->node.mode |= S_IROTH | S_IXOTH;
+
+ b->id = atomic64_inc_return(&domain->last_id);
+ b->bus_flags = flags;
+ b->attach_flags_owner = attach_owner;
+ generate_random_uuid(b->id128);
+ b->bloom = *bloom;
+ b->domain = kdbus_domain_ref(domain);
+
+ kdbus_policy_db_init(&b->policy_db);
+
+ init_rwsem(&b->conn_rwlock);
+ hash_init(b->conn_hash);
+ INIT_LIST_HEAD(&b->monitors_list);
+
+ INIT_LIST_HEAD(&b->notify_list);
+ spin_lock_init(&b->notify_lock);
+ mutex_init(&b->notify_flush_lock);
+
+ ret = kdbus_node_link(&b->node, &domain->node, name);
+ if (ret < 0)
+ goto exit_unref;
+
+ /* cache the metadata/credentials of the creator */
+ b->creator_meta = kdbus_meta_proc_new();
+ if (IS_ERR(b->creator_meta)) {
+ ret = PTR_ERR(b->creator_meta);
+ b->creator_meta = NULL;
+ goto exit_unref;
+ }
+
+ ret = kdbus_meta_proc_collect(b->creator_meta,
+ KDBUS_ATTACH_CREDS |
+ KDBUS_ATTACH_PIDS |
+ KDBUS_ATTACH_AUXGROUPS |
+ KDBUS_ATTACH_TID_COMM |
+ KDBUS_ATTACH_PID_COMM |
+ KDBUS_ATTACH_EXE |
+ KDBUS_ATTACH_CMDLINE |
+ KDBUS_ATTACH_CGROUP |
+ KDBUS_ATTACH_CAPS |
+ KDBUS_ATTACH_SECLABEL |
+ KDBUS_ATTACH_AUDIT);
+ if (ret < 0)
+ goto exit_unref;
+
+ b->name_registry = kdbus_name_registry_new();
+ if (IS_ERR(b->name_registry)) {
+ ret = PTR_ERR(b->name_registry);
+ b->name_registry = NULL;
+ goto exit_unref;
+ }
+
+ /*
+ * Bus-limits of the creator are accounted on its real UID, just like
+ * all other per-user limits.
+ */
+ b->creator = kdbus_user_lookup(domain, current_uid());
+ if (IS_ERR(b->creator)) {
+ ret = PTR_ERR(b->creator);
+ b->creator = NULL;
+ goto exit_unref;
+ }
+
+ return b;
+
+exit_unref:
+ kdbus_node_deactivate(&b->node);
+ kdbus_node_unref(&b->node);
+ return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_bus_ref() - increase the reference counter of a kdbus_bus
+ * @bus: The bus to reference
+ *
+ * Every user of a bus, except for its creator, must add a reference to the
+ * kdbus_bus using this function.
+ *
+ * Return: the bus itself
+ */
+struct kdbus_bus *kdbus_bus_ref(struct kdbus_bus *bus)
+{
+ if (bus)
+ kdbus_node_ref(&bus->node);
+ return bus;
+}
+
+/**
+ * kdbus_bus_unref() - decrease the reference counter of a kdbus_bus
+ * @bus: The bus to unref
+ *
+ * Release a reference. If the reference count drops to 0, the bus will be
+ * freed.
+ *
+ * Return: NULL
+ */
+struct kdbus_bus *kdbus_bus_unref(struct kdbus_bus *bus)
+{
+ if (bus)
+ kdbus_node_unref(&bus->node);
+ return NULL;
+}
+
+/**
+ * kdbus_bus_find_conn_by_id() - find a connection with a given id
+ * @bus: The bus to look for the connection
+ * @id: The 64-bit connection id
+ *
+ * Looks up a connection with a given id. The returned connection
+ * is ref'ed, and needs to be unref'ed by the user. Returns NULL if
+ * the connection can't be found.
+ */
+struct kdbus_conn *kdbus_bus_find_conn_by_id(struct kdbus_bus *bus, u64 id)
+{
+ struct kdbus_conn *conn, *found = NULL;
+
+ down_read(&bus->conn_rwlock);
+ hash_for_each_possible(bus->conn_hash, conn, hentry, id)
+ if (conn->id == id) {
+ found = kdbus_conn_ref(conn);
+ break;
+ }
+ up_read(&bus->conn_rwlock);
+
+ return found;
+}
+
+/**
+ * kdbus_bus_broadcast() - send a message to all subscribed connections
+ * @bus: The bus the connections are connected to
+ * @conn_src: The source connection, may be %NULL for kernel notifications
+ * @staging: Staging object containing the message to send
+ *
+ * Send message to all connections that are currently active on the bus.
+ * Connections must still have matches installed in order to let the message
+ * pass.
+ *
+ * The caller must hold the name-registry lock of @bus.
+ */
+void kdbus_bus_broadcast(struct kdbus_bus *bus,
+ struct kdbus_conn *conn_src,
+ struct kdbus_staging *staging)
+{
+ struct kdbus_conn *conn_dst;
+ unsigned int i;
+ int ret;
+
+ lockdep_assert_held(&bus->name_registry->rwlock);
+
+ /*
+ * Make sure broadcasts are queued on monitors before we send them out to
+ * anyone else. Otherwise, connections might react to broadcasts before
+ * the monitor gets the broadcast queued. In the worst case, the
+ * monitor sees a reaction to the broadcast before the broadcast itself.
+ * We don't give ordering guarantees across connections (and monitors
+ * can re-construct order via sequence numbers), but we should at least
+ * try to avoid re-ordering for monitors.
+ */
+ kdbus_bus_eavesdrop(bus, conn_src, staging);
+
+ down_read(&bus->conn_rwlock);
+ hash_for_each(bus->conn_hash, i, conn_dst, hentry) {
+ if (!kdbus_conn_is_ordinary(conn_dst))
+ continue;
+
+ /*
+ * Check if there is a match for the kmsg object in
+ * the destination connection match db
+ */
+ if (!kdbus_match_db_match_msg(conn_dst->match_db, conn_src,
+ staging))
+ continue;
+
+ if (conn_src) {
+ /*
+ * Anyone can send broadcasts, as they have no
+ * destination. But a receiver needs TALK access to
+ * the sender in order to receive broadcasts.
+ */
+ if (!kdbus_conn_policy_talk(conn_dst, NULL, conn_src))
+ continue;
+ } else {
+ /*
+ * Check if there is a policy db that prevents the
+ * destination connection from receiving this kernel
+ * notification
+ */
+ if (!kdbus_conn_policy_see_notification(conn_dst, NULL,
+ staging->msg))
+ continue;
+ }
+
+ ret = kdbus_conn_entry_insert(conn_src, conn_dst, staging,
+ NULL, NULL);
+ if (ret < 0)
+ kdbus_conn_lost_message(conn_dst);
+ }
+ up_read(&bus->conn_rwlock);
+}
+
+/**
+ * kdbus_bus_eavesdrop() - send a message to all subscribed monitors
+ * @bus: The bus the monitors are connected to
+ * @conn_src: The source connection, may be %NULL for kernel notifications
+ * @staging: Staging object containing the message to send
+ *
+ * Send message to all monitors that are currently active on the bus. Monitors
+ * must still have matches installed in order to let the message pass.
+ *
+ * The caller must hold the name-registry lock of @bus.
+ */
+void kdbus_bus_eavesdrop(struct kdbus_bus *bus,
+ struct kdbus_conn *conn_src,
+ struct kdbus_staging *staging)
+{
+ struct kdbus_conn *conn_dst;
+ int ret;
+
+ /*
+ * Monitor connections get all messages; ignore possible errors
+ * when sending messages to monitor connections.
+ */
+
+ lockdep_assert_held(&bus->name_registry->rwlock);
+
+ down_read(&bus->conn_rwlock);
+ list_for_each_entry(conn_dst, &bus->monitors_list, monitor_entry) {
+ ret = kdbus_conn_entry_insert(conn_src, conn_dst, staging,
+ NULL, NULL);
+ if (ret < 0)
+ kdbus_conn_lost_message(conn_dst);
+ }
+ up_read(&bus->conn_rwlock);
+}
+
+/**
+ * kdbus_cmd_bus_make() - handle KDBUS_CMD_BUS_MAKE
+ * @domain: domain to operate on
+ * @argp: command payload
+ *
+ * Return: NULL or newly created bus on success, ERR_PTR on failure.
+ */
+struct kdbus_bus *kdbus_cmd_bus_make(struct kdbus_domain *domain,
+ void __user *argp)
+{
+ struct kdbus_bus *bus = NULL;
+ struct kdbus_cmd *cmd;
+ struct kdbus_ep *ep = NULL;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_MAKE_NAME, .mandatory = true },
+ { .type = KDBUS_ITEM_BLOOM_PARAMETER, .mandatory = true },
+ { .type = KDBUS_ITEM_ATTACH_FLAGS_SEND },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE |
+ KDBUS_MAKE_ACCESS_GROUP |
+ KDBUS_MAKE_ACCESS_WORLD,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+
+ bus = kdbus_bus_new(domain,
+ argv[1].item->str, &argv[2].item->bloom_parameter,
+ argv[3].item ? argv[3].item->data64 : NULL,
+ cmd->flags, current_euid(), current_egid());
+ if (IS_ERR(bus)) {
+ ret = PTR_ERR(bus);
+ bus = NULL;
+ goto exit;
+ }
+
+ if (atomic_inc_return(&bus->creator->buses) > KDBUS_USER_MAX_BUSES) {
+ atomic_dec(&bus->creator->buses);
+ ret = -EMFILE;
+ goto exit;
+ }
+
+ if (!kdbus_node_activate(&bus->node)) {
+ atomic_dec(&bus->creator->buses);
+ ret = -ESHUTDOWN;
+ goto exit;
+ }
+
+ ep = kdbus_ep_new(bus, "bus", cmd->flags, bus->node.uid, bus->node.gid,
+ false);
+ if (IS_ERR(ep)) {
+ ret = PTR_ERR(ep);
+ ep = NULL;
+ goto exit;
+ }
+
+ if (!kdbus_node_activate(&ep->node)) {
+ ret = -ESHUTDOWN;
+ goto exit;
+ }
+
+ /*
+ * Drop our own reference, effectively causing the endpoint to be
+ * deactivated and released when the parent bus is.
+ */
+ ep = kdbus_ep_unref(ep);
+
+exit:
+ ret = kdbus_args_clear(&args, ret);
+ if (ret < 0) {
+ if (ep) {
+ kdbus_node_deactivate(&ep->node);
+ kdbus_ep_unref(ep);
+ }
+ if (bus) {
+ kdbus_node_deactivate(&bus->node);
+ kdbus_bus_unref(bus);
+ }
+ return ERR_PTR(ret);
+ }
+ return bus;
+}
+
+/**
+ * kdbus_cmd_bus_creator_info() - handle KDBUS_CMD_BUS_CREATOR_INFO
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_bus_creator_info(struct kdbus_conn *conn, void __user *argp)
+{
+ struct kdbus_cmd_info *cmd;
+ struct kdbus_bus *bus = conn->ep->bus;
+ struct kdbus_pool_slice *slice = NULL;
+ struct kdbus_item *meta_items = NULL;
+ struct kdbus_item_header item_hdr;
+ struct kdbus_info info = {};
+ size_t meta_size, name_len, cnt = 0;
+ struct kvec kvec[6];
+ u64 attach_flags, size = 0;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ ret = kdbus_sanitize_attach_flags(cmd->attach_flags, &attach_flags);
+ if (ret < 0)
+ goto exit;
+
+ attach_flags &= bus->attach_flags_owner;
+
+ ret = kdbus_meta_emit(bus->creator_meta, NULL, NULL, conn,
+ attach_flags, &meta_items, &meta_size);
+ if (ret < 0)
+ goto exit;
+
+ name_len = strlen(bus->node.name) + 1;
+ info.id = bus->id;
+ info.flags = bus->bus_flags;
+ item_hdr.type = KDBUS_ITEM_MAKE_NAME;
+ item_hdr.size = KDBUS_ITEM_HEADER_SIZE + name_len;
+
+ kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &size);
+ kdbus_kvec_set(&kvec[cnt++], &item_hdr, sizeof(item_hdr), &size);
+ kdbus_kvec_set(&kvec[cnt++], bus->node.name, name_len, &size);
+ cnt += !!kdbus_kvec_pad(&kvec[cnt], &size);
+ if (meta_size > 0) {
+ kdbus_kvec_set(&kvec[cnt++], meta_items, meta_size, &size);
+ cnt += !!kdbus_kvec_pad(&kvec[cnt], &size);
+ }
+
+ info.size = size;
+
+ slice = kdbus_pool_slice_alloc(conn->pool, size, false);
+ if (IS_ERR(slice)) {
+ ret = PTR_ERR(slice);
+ slice = NULL;
+ goto exit;
+ }
+
+ ret = kdbus_pool_slice_copy_kvec(slice, 0, kvec, cnt, size);
+ if (ret < 0)
+ goto exit;
+
+ kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->info_size);
+
+ if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) ||
+ kdbus_member_set_user(&cmd->info_size, argp,
+ typeof(*cmd), info_size))
+ ret = -EFAULT;
+
+exit:
+ kdbus_pool_slice_release(slice);
+ kfree(meta_items);
+ return kdbus_args_clear(&args, ret);
+}
diff --git a/ipc/kdbus/bus.h b/ipc/kdbus/bus.h
new file mode 100644
index 000000000..238986eff
--- /dev/null
+++ b/ipc/kdbus/bus.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_BUS_H
+#define __KDBUS_BUS_H
+
+#include <linux/hashtable.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/spinlock.h>
+#include <uapi/linux/kdbus.h>
+
+#include "metadata.h"
+#include "names.h"
+#include "node.h"
+#include "policy.h"
+
+struct kdbus_conn;
+struct kdbus_domain;
+struct kdbus_staging;
+struct kdbus_user;
+
+/**
+ * struct kdbus_bus - bus in a domain
+ * @node: kdbus_node
+ * @id: ID of this bus in the domain
+ * @bus_flags: Simple pass-through flags from userspace to userspace
+ * @attach_flags_owner: KDBUS_ATTACH_* flags of bus creator that other
+ * connections can see or query
+ * @id128: Unique random 128 bit ID of this bus
+ * @bloom: Bloom parameters
+ * @domain: Domain of this bus
+ * @creator: Creator of the bus
+ * @creator_meta: Meta information about the bus creator
+ * @policy_db: Policy database for this bus
+ * @name_registry: Name registry of this bus
+ * @conn_rwlock: Read/Write lock for all lists of child connections
+ * @conn_hash: Map of connection IDs
+ * @monitors_list: Connections that monitor this bus
+ * @notify_list: List of pending kernel-generated messages
+ * @notify_lock: Notification list lock
+ * @notify_flush_lock: Notification flushing lock
+ */
+struct kdbus_bus {
+ struct kdbus_node node;
+
+ /* static */
+ u64 id;
+ u64 bus_flags;
+ u64 attach_flags_owner;
+ u8 id128[16];
+ struct kdbus_bloom_parameter bloom;
+ struct kdbus_domain *domain;
+ struct kdbus_user *creator;
+ struct kdbus_meta_proc *creator_meta;
+
+ /* protected by own locks */
+ struct kdbus_policy_db policy_db;
+ struct kdbus_name_registry *name_registry;
+
+ /* protected by conn_rwlock */
+ struct rw_semaphore conn_rwlock;
+ DECLARE_HASHTABLE(conn_hash, 8);
+ struct list_head monitors_list;
+
+ /* protected by notify_lock */
+ struct list_head notify_list;
+ spinlock_t notify_lock;
+ struct mutex notify_flush_lock;
+};
+
+struct kdbus_bus *kdbus_bus_ref(struct kdbus_bus *bus);
+struct kdbus_bus *kdbus_bus_unref(struct kdbus_bus *bus);
+
+struct kdbus_conn *kdbus_bus_find_conn_by_id(struct kdbus_bus *bus, u64 id);
+void kdbus_bus_broadcast(struct kdbus_bus *bus,
+ struct kdbus_conn *conn_src,
+ struct kdbus_staging *staging);
+void kdbus_bus_eavesdrop(struct kdbus_bus *bus,
+ struct kdbus_conn *conn_src,
+ struct kdbus_staging *staging);
+
+struct kdbus_bus *kdbus_cmd_bus_make(struct kdbus_domain *domain,
+ void __user *argp);
+int kdbus_cmd_bus_creator_info(struct kdbus_conn *conn, void __user *argp);
+
+#endif
diff --git a/ipc/kdbus/connection.c b/ipc/kdbus/connection.c
new file mode 100644
index 000000000..d94b417e0
--- /dev/null
+++ b/ipc/kdbus/connection.c
@@ -0,0 +1,2207 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/audit.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fs_struct.h>
+#include <linux/hashtable.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/math64.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/path.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/shmem_fs.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "match.h"
+#include "message.h"
+#include "metadata.h"
+#include "names.h"
+#include "domain.h"
+#include "item.h"
+#include "notify.h"
+#include "policy.h"
+#include "pool.h"
+#include "reply.h"
+#include "util.h"
+#include "queue.h"
+
+#define KDBUS_CONN_ACTIVE_BIAS (INT_MIN + 2)
+#define KDBUS_CONN_ACTIVE_NEW (INT_MIN + 1)
+
+static struct kdbus_conn *kdbus_conn_new(struct kdbus_ep *ep, bool privileged,
+ struct kdbus_cmd_hello *hello,
+ const char *name,
+ const struct kdbus_creds *creds,
+ const struct kdbus_pids *pids,
+ const char *seclabel,
+ const char *conn_description)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key __key;
+#endif
+ struct kdbus_pool_slice *slice = NULL;
+ struct kdbus_bus *bus = ep->bus;
+ struct kdbus_conn *conn;
+ u64 attach_flags_send;
+ u64 attach_flags_recv;
+ u64 items_size = 0;
+ bool is_policy_holder;
+ bool is_activator;
+ bool is_monitor;
+ struct kvec kvec;
+ int ret;
+
+ struct {
+ u64 size;
+ u64 type;
+ struct kdbus_bloom_parameter bloom;
+ } bloom_item;
+
+ is_monitor = hello->flags & KDBUS_HELLO_MONITOR;
+ is_activator = hello->flags & KDBUS_HELLO_ACTIVATOR;
+ is_policy_holder = hello->flags & KDBUS_HELLO_POLICY_HOLDER;
+
+ if (!hello->pool_size || !IS_ALIGNED(hello->pool_size, PAGE_SIZE))
+ return ERR_PTR(-EINVAL);
+ if (is_monitor + is_activator + is_policy_holder > 1)
+ return ERR_PTR(-EINVAL);
+ if (name && !is_activator && !is_policy_holder)
+ return ERR_PTR(-EINVAL);
+ if (!name && (is_activator || is_policy_holder))
+ return ERR_PTR(-EINVAL);
+ if (name && !kdbus_name_is_valid(name, true))
+ return ERR_PTR(-EINVAL);
+ if (is_monitor && ep->user)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (!privileged && (is_activator || is_policy_holder || is_monitor))
+ return ERR_PTR(-EPERM);
+ if ((creds || pids || seclabel) && !privileged)
+ return ERR_PTR(-EPERM);
+
+ ret = kdbus_sanitize_attach_flags(hello->attach_flags_send,
+ &attach_flags_send);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ ret = kdbus_sanitize_attach_flags(hello->attach_flags_recv,
+ &attach_flags_recv);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ conn = kzalloc(sizeof(*conn), GFP_KERNEL);
+ if (!conn)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&conn->kref);
+ atomic_set(&conn->active, KDBUS_CONN_ACTIVE_NEW);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_init_map(&conn->dep_map, "s_active", &__key, 0);
+#endif
+ mutex_init(&conn->lock);
+ INIT_LIST_HEAD(&conn->names_list);
+ INIT_LIST_HEAD(&conn->names_queue_list);
+ INIT_LIST_HEAD(&conn->reply_list);
+ atomic_set(&conn->name_count, 0);
+ atomic_set(&conn->request_count, 0);
+ atomic_set(&conn->lost_count, 0);
+ INIT_DELAYED_WORK(&conn->work, kdbus_reply_list_scan_work);
+ conn->cred = get_current_cred();
+ conn->pid = get_pid(task_pid(current));
+ get_fs_root(current->fs, &conn->root_path);
+ init_waitqueue_head(&conn->wait);
+ kdbus_queue_init(&conn->queue);
+ conn->privileged = privileged;
+ conn->ep = kdbus_ep_ref(ep);
+ conn->id = atomic64_inc_return(&bus->domain->last_id);
+ conn->flags = hello->flags;
+ atomic64_set(&conn->attach_flags_send, attach_flags_send);
+ atomic64_set(&conn->attach_flags_recv, attach_flags_recv);
+ INIT_LIST_HEAD(&conn->monitor_entry);
+
+ if (conn_description) {
+ conn->description = kstrdup(conn_description, GFP_KERNEL);
+ if (!conn->description) {
+ ret = -ENOMEM;
+ goto exit_unref;
+ }
+ }
+
+ conn->pool = kdbus_pool_new(conn->description, hello->pool_size);
+ if (IS_ERR(conn->pool)) {
+ ret = PTR_ERR(conn->pool);
+ conn->pool = NULL;
+ goto exit_unref;
+ }
+
+ conn->match_db = kdbus_match_db_new();
+ if (IS_ERR(conn->match_db)) {
+ ret = PTR_ERR(conn->match_db);
+ conn->match_db = NULL;
+ goto exit_unref;
+ }
+
+ /* return properties of this connection to the caller */
+ hello->bus_flags = bus->bus_flags;
+ hello->id = conn->id;
+
+ BUILD_BUG_ON(sizeof(bus->id128) != sizeof(hello->id128));
+ memcpy(hello->id128, bus->id128, sizeof(hello->id128));
+
+ /* privileged processes can impersonate somebody else */
+ if (creds || pids || seclabel) {
+ conn->meta_fake = kdbus_meta_fake_new();
+ if (IS_ERR(conn->meta_fake)) {
+ ret = PTR_ERR(conn->meta_fake);
+ conn->meta_fake = NULL;
+ goto exit_unref;
+ }
+
+ ret = kdbus_meta_fake_collect(conn->meta_fake,
+ creds, pids, seclabel);
+ if (ret < 0)
+ goto exit_unref;
+ } else {
+ conn->meta_proc = kdbus_meta_proc_new();
+ if (IS_ERR(conn->meta_proc)) {
+ ret = PTR_ERR(conn->meta_proc);
+ conn->meta_proc = NULL;
+ goto exit_unref;
+ }
+
+ ret = kdbus_meta_proc_collect(conn->meta_proc,
+ KDBUS_ATTACH_CREDS |
+ KDBUS_ATTACH_PIDS |
+ KDBUS_ATTACH_AUXGROUPS |
+ KDBUS_ATTACH_TID_COMM |
+ KDBUS_ATTACH_PID_COMM |
+ KDBUS_ATTACH_EXE |
+ KDBUS_ATTACH_CMDLINE |
+ KDBUS_ATTACH_CGROUP |
+ KDBUS_ATTACH_CAPS |
+ KDBUS_ATTACH_SECLABEL |
+ KDBUS_ATTACH_AUDIT);
+ if (ret < 0)
+ goto exit_unref;
+ }
+
+ /*
+ * Account the connection against the current user (UID), or for
+ * custom endpoints use the anonymous user assigned to the endpoint.
+ * Note that limits are always accounted against the real UID, not
+ * the effective UID (cred->user always points to the accounting of
+ * cred->uid, not cred->euid).
+ */
+ if (ep->user) {
+ conn->user = kdbus_user_ref(ep->user);
+ } else {
+ conn->user = kdbus_user_lookup(ep->bus->domain, current_uid());
+ if (IS_ERR(conn->user)) {
+ ret = PTR_ERR(conn->user);
+ conn->user = NULL;
+ goto exit_unref;
+ }
+ }
+
+ if (atomic_inc_return(&conn->user->connections) > KDBUS_USER_MAX_CONN) {
+ /* decremented by destructor as conn->user is valid */
+ ret = -EMFILE;
+ goto exit_unref;
+ }
+
+ bloom_item.size = sizeof(bloom_item);
+ bloom_item.type = KDBUS_ITEM_BLOOM_PARAMETER;
+ bloom_item.bloom = bus->bloom;
+ kdbus_kvec_set(&kvec, &bloom_item, bloom_item.size, &items_size);
+
+ slice = kdbus_pool_slice_alloc(conn->pool, items_size, false);
+ if (IS_ERR(slice)) {
+ ret = PTR_ERR(slice);
+ slice = NULL;
+ goto exit_unref;
+ }
+
+ ret = kdbus_pool_slice_copy_kvec(slice, 0, &kvec, 1, items_size);
+ if (ret < 0)
+ goto exit_unref;
+
+ kdbus_pool_slice_publish(slice, &hello->offset, &hello->items_size);
+ kdbus_pool_slice_release(slice);
+
+ return conn;
+
+exit_unref:
+ kdbus_pool_slice_release(slice);
+ kdbus_conn_unref(conn);
+ return ERR_PTR(ret);
+}
+
+static void __kdbus_conn_free(struct kref *kref)
+{
+ struct kdbus_conn *conn = container_of(kref, struct kdbus_conn, kref);
+
+ WARN_ON(kdbus_conn_active(conn));
+ WARN_ON(delayed_work_pending(&conn->work));
+ WARN_ON(!list_empty(&conn->queue.msg_list));
+ WARN_ON(!list_empty(&conn->names_list));
+ WARN_ON(!list_empty(&conn->names_queue_list));
+ WARN_ON(!list_empty(&conn->reply_list));
+
+ if (conn->user) {
+ atomic_dec(&conn->user->connections);
+ kdbus_user_unref(conn->user);
+ }
+
+ kdbus_meta_fake_free(conn->meta_fake);
+ kdbus_meta_proc_unref(conn->meta_proc);
+ kdbus_match_db_free(conn->match_db);
+ kdbus_pool_free(conn->pool);
+ kdbus_ep_unref(conn->ep);
+ path_put(&conn->root_path);
+ put_pid(conn->pid);
+ put_cred(conn->cred);
+ kfree(conn->description);
+ kfree(conn->quota);
+ kfree(conn);
+}
+
+/**
+ * kdbus_conn_ref() - take a connection reference
+ * @conn: Connection, may be %NULL
+ *
+ * Return: the connection itself
+ */
+struct kdbus_conn *kdbus_conn_ref(struct kdbus_conn *conn)
+{
+ if (conn)
+ kref_get(&conn->kref);
+ return conn;
+}
+
+/**
+ * kdbus_conn_unref() - drop a connection reference
+ * @conn: Connection (may be NULL)
+ *
+ * When the last reference is dropped, the connection's internal structure
+ * is freed.
+ *
+ * Return: NULL
+ */
+struct kdbus_conn *kdbus_conn_unref(struct kdbus_conn *conn)
+{
+ if (conn)
+ kref_put(&conn->kref, __kdbus_conn_free);
+ return NULL;
+}
+
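+/*
+ * Usage sketch (illustrative only): because kdbus_conn_unref() always
+ * returns NULL, callers can drop a reference and clear their pointer in
+ * one statement:
+ *
+ *	conn = kdbus_conn_unref(conn);
+ *
+ * which avoids keeping a dangling pointer to a possibly freed
+ * connection.
+ */
+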
+/**
+ * kdbus_conn_active() - connection is not disconnected
+ * @conn: Connection to check
+ *
+ * Returns true if the connection has not been disconnected yet. Note that a
+ * connection might be disconnected asynchronously, unless you hold the
+ * connection lock. If that's not suitable for you, see kdbus_conn_acquire() to
+ * suppress connection shutdown for a short period.
+ *
+ * Return: true if the connection is still active
+ */
+bool kdbus_conn_active(const struct kdbus_conn *conn)
+{
+ return atomic_read(&conn->active) >= 0;
+}
+
+/**
+ * kdbus_conn_acquire() - acquire an active connection reference
+ * @conn: Connection
+ *
+ * Users can close a connection via KDBUS_BYEBYE (or by destroying the
+ * endpoint/bus/...) at any time. Whenever this happens, we should deny any
+ * user-visible action on this connection and signal ECONNRESET instead.
+ * To avoid testing for connection availability every time you take the
+ * connection-lock, you can acquire a connection for short periods.
+ *
+ * By calling kdbus_conn_acquire(), you gain an "active reference" to the
+ * connection. You must also hold a regular reference at any time! As long as
+ * you hold the active-ref, the connection will not be shut down. However, if
+ * the connection was shut down, you can never acquire an active-ref again.
+ *
+ * kdbus_conn_disconnect() disables the connection and then waits for all active
+ * references to be dropped. It will also wake up any pending operation.
+ * However, you must not sleep for an indefinite period while holding an
+ * active-reference. Otherwise, kdbus_conn_disconnect() might stall. If you need
+ * to sleep for an indefinite period, either release the reference and try to
+ * acquire it again after waking up, or make kdbus_conn_disconnect() wake up
+ * your wait-queue.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_conn_acquire(struct kdbus_conn *conn)
+{
+ if (!atomic_inc_unless_negative(&conn->active))
+ return -ECONNRESET;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire_read(&conn->dep_map, 0, 1, _RET_IP_);
+#endif
+
+ return 0;
+}
+
+/**
+ * kdbus_conn_release() - release an active connection reference
+ * @conn: Connection
+ *
+ * This releases an active reference that has been acquired via
+ * kdbus_conn_acquire(). If the connection was already disabled and this is the
+ * last active-ref that is dropped, the disconnect-waiter will be woken up and
+ * properly close the connection.
+ */
+void kdbus_conn_release(struct kdbus_conn *conn)
+{
+ int v;
+
+ if (!conn)
+ return;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_release(&conn->dep_map, 1, _RET_IP_);
+#endif
+
+ v = atomic_dec_return(&conn->active);
+ if (v != KDBUS_CONN_ACTIVE_BIAS)
+ return;
+
+ wake_up_all(&conn->wait);
+}
+
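+/*
+ * Minimal user-space sketch of the biased active-counter scheme used by
+ * kdbus_conn_acquire()/kdbus_conn_release() above and kdbus_conn_disconnect()
+ * below. It is illustrative only: the names and the spin-wait are invented
+ * for the example (kdbus sleeps on conn->wait instead of spinning), but the
+ * counter arithmetic mirrors the kernel logic: acquisition succeeds only
+ * while the counter is non-negative, and shutdown adds a large negative bias
+ * and then waits until every outstanding active reference has been released.
+ */
+#include <limits.h>
+#include <sched.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+
+#define EXAMPLE_CONN_BIAS	(INT_MIN / 2)
+
+struct example_conn {
+	atomic_int active;
+};
+
+static bool example_conn_acquire(struct example_conn *c)
+{
+	int v = atomic_load(&c->active);
+
+	while (v >= 0) {
+		/* mirrors atomic_inc_unless_negative() */
+		if (atomic_compare_exchange_weak(&c->active, &v, v + 1))
+			return true;
+	}
+
+	return false; /* already shut down, like -ECONNRESET above */
+}
+
+static void example_conn_release(struct example_conn *c)
+{
+	/* once this hits the bias, the disconnecting thread may proceed */
+	atomic_fetch_sub(&c->active, 1);
+}
+
+static void example_conn_disconnect(struct example_conn *c)
+{
+	/* make the counter negative so new acquisitions fail ... */
+	atomic_fetch_add(&c->active, EXAMPLE_CONN_BIAS);
+
+	/* ... then wait until all active references are gone */
+	while (atomic_load(&c->active) != EXAMPLE_CONN_BIAS)
+		sched_yield();
+}
+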
+static int kdbus_conn_connect(struct kdbus_conn *conn, const char *name)
+{
+ struct kdbus_ep *ep = conn->ep;
+ struct kdbus_bus *bus = ep->bus;
+ int ret;
+
+ if (WARN_ON(atomic_read(&conn->active) != KDBUS_CONN_ACTIVE_NEW))
+ return -EALREADY;
+
+ /* make sure the ep-node is active while we add our connection */
+ if (!kdbus_node_acquire(&ep->node))
+ return -ESHUTDOWN;
+
+ /* lock order: domain -> bus -> ep -> names -> conn */
+ mutex_lock(&ep->lock);
+ down_write(&bus->conn_rwlock);
+
+ /* link into monitor list */
+ if (kdbus_conn_is_monitor(conn))
+ list_add_tail(&conn->monitor_entry, &bus->monitors_list);
+
+ /* link into bus and endpoint */
+ list_add_tail(&conn->ep_entry, &ep->conn_list);
+ hash_add(bus->conn_hash, &conn->hentry, conn->id);
+
+ /* enable lookups and acquire active ref */
+ atomic_set(&conn->active, 1);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire_read(&conn->dep_map, 0, 1, _RET_IP_);
+#endif
+
+ up_write(&bus->conn_rwlock);
+ mutex_unlock(&ep->lock);
+
+ kdbus_node_release(&ep->node);
+
+ /*
+ * Notify subscribers about the new active connection, unless it is
+ * a monitor. Monitors are invisible on the bus, can't be addressed
+ * directly, and won't cause any notifications.
+ */
+ if (!kdbus_conn_is_monitor(conn)) {
+ ret = kdbus_notify_id_change(bus, KDBUS_ITEM_ID_ADD,
+ conn->id, conn->flags);
+ if (ret < 0)
+ goto exit_disconnect;
+ }
+
+ if (kdbus_conn_is_activator(conn)) {
+ u64 flags = KDBUS_NAME_ACTIVATOR;
+
+ if (WARN_ON(!name)) {
+ ret = -EINVAL;
+ goto exit_disconnect;
+ }
+
+ ret = kdbus_name_acquire(bus->name_registry, conn, name,
+ flags, NULL);
+ if (ret < 0)
+ goto exit_disconnect;
+ }
+
+ kdbus_conn_release(conn);
+ kdbus_notify_flush(bus);
+ return 0;
+
+exit_disconnect:
+ kdbus_conn_release(conn);
+ kdbus_conn_disconnect(conn, false);
+ return ret;
+}
+
+/**
+ * kdbus_conn_disconnect() - disconnect a connection
+ * @conn: The connection to disconnect
+ * @ensure_queue_empty: Flag to indicate if the call should fail in
+ * case the connection's message list is not
+ * empty
+ *
+ * If @ensure_queue_empty is true and the connection has pending messages,
+ * -EBUSY is returned.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+int kdbus_conn_disconnect(struct kdbus_conn *conn, bool ensure_queue_empty)
+{
+ struct kdbus_queue_entry *entry, *tmp;
+ struct kdbus_bus *bus = conn->ep->bus;
+ struct kdbus_reply *r, *r_tmp;
+ struct kdbus_conn *c;
+ int i, v;
+
+ mutex_lock(&conn->lock);
+ v = atomic_read(&conn->active);
+ if (v == KDBUS_CONN_ACTIVE_NEW) {
+ /* was never connected */
+ mutex_unlock(&conn->lock);
+ return 0;
+ }
+ if (v < 0) {
+ /* already dead */
+ mutex_unlock(&conn->lock);
+ return -ECONNRESET;
+ }
+ if (ensure_queue_empty && !list_empty(&conn->queue.msg_list)) {
+ /* still busy */
+ mutex_unlock(&conn->lock);
+ return -EBUSY;
+ }
+
+ atomic_add(KDBUS_CONN_ACTIVE_BIAS, &conn->active);
+ mutex_unlock(&conn->lock);
+
+ wake_up_interruptible(&conn->wait);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire(&conn->dep_map, 0, 0, _RET_IP_);
+ if (atomic_read(&conn->active) != KDBUS_CONN_ACTIVE_BIAS)
+ lock_contended(&conn->dep_map, _RET_IP_);
+#endif
+
+ wait_event(conn->wait,
+ atomic_read(&conn->active) == KDBUS_CONN_ACTIVE_BIAS);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lock_acquired(&conn->dep_map, _RET_IP_);
+ rwsem_release(&conn->dep_map, 1, _RET_IP_);
+#endif
+
+ cancel_delayed_work_sync(&conn->work);
+ kdbus_policy_remove_owner(&conn->ep->bus->policy_db, conn);
+
+ /* lock order: domain -> bus -> ep -> names -> conn */
+ mutex_lock(&conn->ep->lock);
+ down_write(&bus->conn_rwlock);
+
+ /* remove from bus and endpoint */
+ hash_del(&conn->hentry);
+ list_del(&conn->monitor_entry);
+ list_del(&conn->ep_entry);
+
+ up_write(&bus->conn_rwlock);
+ mutex_unlock(&conn->ep->lock);
+
+ /*
+ * Remove all names associated with this connection; this possibly
+ * moves queued messages back to the activator connection.
+ */
+ kdbus_name_release_all(bus->name_registry, conn);
+
+ /* if we die while other connections wait for our reply, notify them */
+ mutex_lock(&conn->lock);
+ list_for_each_entry_safe(entry, tmp, &conn->queue.msg_list, entry) {
+ if (entry->reply)
+ kdbus_notify_reply_dead(bus,
+ entry->reply->reply_dst->id,
+ entry->reply->cookie);
+ kdbus_queue_entry_free(entry);
+ }
+
+ list_for_each_entry_safe(r, r_tmp, &conn->reply_list, entry)
+ kdbus_reply_unlink(r);
+ mutex_unlock(&conn->lock);
+
+ /* lock order: domain -> bus -> ep -> names -> conn */
+ down_read(&bus->conn_rwlock);
+ hash_for_each(bus->conn_hash, i, c, hentry) {
+ mutex_lock(&c->lock);
+ list_for_each_entry_safe(r, r_tmp, &c->reply_list, entry) {
+ if (r->reply_src != conn)
+ continue;
+
+ if (r->sync)
+ kdbus_sync_reply_wakeup(r, -EPIPE);
+ else
+ /* send a 'connection dead' notification */
+ kdbus_notify_reply_dead(bus, c->id, r->cookie);
+
+ kdbus_reply_unlink(r);
+ }
+ mutex_unlock(&c->lock);
+ }
+ up_read(&bus->conn_rwlock);
+
+ if (!kdbus_conn_is_monitor(conn))
+ kdbus_notify_id_change(bus, KDBUS_ITEM_ID_REMOVE,
+ conn->id, conn->flags);
+
+ kdbus_notify_flush(bus);
+
+ return 0;
+}
+
+/**
+ * kdbus_conn_has_name() - check if a connection owns a name
+ * @conn: Connection
+ * @name:		Well-known name to check for
+ *
+ * The caller must hold the registry lock of conn->ep->bus.
+ *
+ * Return: true if the name is currently owned by the connection
+ */
+bool kdbus_conn_has_name(struct kdbus_conn *conn, const char *name)
+{
+ struct kdbus_name_entry *e;
+
+ lockdep_assert_held(&conn->ep->bus->name_registry->rwlock);
+
+ list_for_each_entry(e, &conn->names_list, conn_entry)
+ if (strcmp(e->name, name) == 0)
+ return true;
+
+ return false;
+}
+
+struct kdbus_quota {
+ u32 memory;
+ u16 msgs;
+ u8 fds;
+};
+
+/**
+ * kdbus_conn_quota_inc() - increase quota accounting
+ * @c: connection owning the quota tracking
+ * @u: user to account for (or NULL for kernel accounting)
+ * @memory: size of memory to account for
+ * @fds: number of FDs to account for
+ *
+ * This call manages the quotas on resource @c. That is, it's used if other
+ * users want to use the resources of connection @c, which so far only concerns
+ * the receive queue of the destination.
+ *
+ * This increases the quota-accounting for user @u by @memory bytes and @fds
+ * file descriptors. If the user has already reached the quota limits, this call
+ * will not do any accounting but return a negative error code indicating the
+ * failure.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_conn_quota_inc(struct kdbus_conn *c, struct kdbus_user *u,
+ size_t memory, size_t fds)
+{
+ struct kdbus_quota *quota;
+ size_t available, accounted;
+ unsigned int id;
+
+ /*
+ * Pool Layout:
+ * 50% of a pool is always owned by the connection. It is reserved for
+ * kernel queries, handling received messages and other tasks that are
+ * under control of the pool owner. The other 50% of the pool are used
+ * as incoming queue.
+ * As we optionally support user-space based policies, we need fair
+ * allocation schemes. Furthermore, resource utilization should be
+ * maximized, so only minimal resources stay reserved. However, we need
+ * to adapt to a dynamic number of users, as we cannot know how many
+ * users will talk to a connection. Therefore, the current allocation
+ * works like this:
+ * We limit the number of bytes in a destination's pool per sending
+ * user. The space available for a user is 33% of the unused pool space
+ * (whereas the space used by the user itself is also treated as
+ * 'unused'). This way, we favor users coming first, but keep enough
+ * pool space available for any following users. Given that messages are
+ * dequeued in FIFO order, this should balance nicely if the number of
+ * users grows. At the same time, this algorithm guarantees that the
+ * space available to a connection is reduced dynamically, the more
+ * concurrent users talk to a connection.
+ */
+
+ /* per user-accounting is expensive, so we keep state small */
+ BUILD_BUG_ON(sizeof(quota->memory) != 4);
+ BUILD_BUG_ON(sizeof(quota->msgs) != 2);
+ BUILD_BUG_ON(sizeof(quota->fds) != 1);
+ BUILD_BUG_ON(KDBUS_CONN_MAX_MSGS > U16_MAX);
+ BUILD_BUG_ON(KDBUS_CONN_MAX_FDS_PER_USER > U8_MAX);
+
+ id = u ? u->id : KDBUS_USER_KERNEL_ID;
+ if (id >= c->n_quota) {
+ unsigned int users;
+
+ users = max(KDBUS_ALIGN8(id) + 8, id);
+ quota = krealloc(c->quota, users * sizeof(*quota),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!quota)
+ return -ENOMEM;
+
+ c->n_quota = users;
+ c->quota = quota;
+ }
+
+ quota = &c->quota[id];
+ kdbus_pool_accounted(c->pool, &available, &accounted);
+
+ /* half the pool is _always_ reserved for the pool owner */
+ available /= 2;
+
+ /*
+ * Pool owner slices are un-accounted slices; they can claim more
+ * than 50% of the queue. However, the slices we're dealing with here
+ * belong to the incoming queue, hence they are 'accounted' slices
+ * to which the 50%-limit applies.
+ */
+ if (available < accounted)
+ return -ENOBUFS;
+
+ /* 1/3 of the remaining space (including your own memory) */
+ available = (available - accounted + quota->memory) / 3;
+
+ if (available < quota->memory ||
+ available - quota->memory < memory ||
+ quota->memory + memory > U32_MAX)
+ return -ENOBUFS;
+ if (quota->msgs >= KDBUS_CONN_MAX_MSGS)
+ return -ENOBUFS;
+ if (quota->fds + fds < quota->fds ||
+ quota->fds + fds > KDBUS_CONN_MAX_FDS_PER_USER)
+ return -EMFILE;
+
+ quota->memory += memory;
+ quota->fds += fds;
+ ++quota->msgs;
+ return 0;
+}
+
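+/*
+ * Illustrative user-space sketch of the quota arithmetic documented in
+ * kdbus_conn_quota_inc() above (names invented for the example): half of
+ * the pool is always reserved for the owner, and a sending user may use
+ * at most a third of the space still unused in the incoming queue,
+ * counting the user's own accounted memory as unused.
+ */
+#include <stdbool.h>
+#include <stddef.h>
+
+static bool example_quota_fits(size_t pool_size, size_t accounted_total,
+			       size_t user_accounted, size_t msg_size)
+{
+	/* the incoming queue is the half not reserved for the pool owner */
+	size_t available = pool_size / 2;
+
+	if (available < accounted_total)
+		return false; /* queue already over-committed */
+
+	/* 1/3 of the remaining space, including this user's own share */
+	available = (available - accounted_total + user_accounted) / 3;
+
+	return available >= user_accounted &&
+	       available - user_accounted >= msg_size;
+}
+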
+/**
+ * kdbus_conn_quota_dec() - decrease quota accounting
+ * @c: connection owning the quota tracking
+ * @u: user which was accounted for (or NULL for kernel accounting)
+ * @memory: size of memory which was accounted for
+ * @fds: number of FDs which were accounted for
+ *
+ * This does the reverse of kdbus_conn_quota_inc(). You have to release any
+ * accounted resources that you called kdbus_conn_quota_inc() for. However, you
+ * must not call kdbus_conn_quota_dec() if the accounting failed (that is,
+ * kdbus_conn_quota_inc() failed).
+ */
+void kdbus_conn_quota_dec(struct kdbus_conn *c, struct kdbus_user *u,
+ size_t memory, size_t fds)
+{
+ struct kdbus_quota *quota;
+ unsigned int id;
+
+ id = u ? u->id : KDBUS_USER_KERNEL_ID;
+ if (WARN_ON(id >= c->n_quota))
+ return;
+
+ quota = &c->quota[id];
+
+ if (!WARN_ON(quota->msgs == 0))
+ --quota->msgs;
+ if (!WARN_ON(quota->memory < memory))
+ quota->memory -= memory;
+ if (!WARN_ON(quota->fds < fds))
+ quota->fds -= fds;
+}
+
+/**
+ * kdbus_conn_lost_message() - handle lost messages
+ * @c: connection that lost a message
+ *
+ * kdbus is reliable. That means, we try hard to never lose messages. However,
+ * memory is limited, so we cannot rely on transmissions to never fail.
+ * Therefore, we use quota-limits to let callers know if their unicast message
+ * cannot be transmitted to a peer. This works fine for unicasts, but for
+ * broadcasts we cannot make the caller handle the transmission failure.
+ * Instead, we must let the destination know that it couldn't receive a
+ * broadcast.
+ * As this is an unlikely scenario, we keep it simple. A single lost-counter
+ * remembers the number of lost messages since the last call to RECV. The next
+ * message retrieval will notify the connection that it lost messages since the
+ * last message retrieval and thus should resync its state.
+ */
+void kdbus_conn_lost_message(struct kdbus_conn *c)
+{
+ if (atomic_inc_return(&c->lost_count) == 1)
+ wake_up_interruptible(&c->wait);
+}
+
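+/*
+ * Illustrative sketch of the lost-broadcast accounting described above
+ * (names invented for the example): the sending side only bumps a
+ * counter, and the receiving side fetches and resets it atomically on
+ * its next receive, so exactly one RECV reports all messages dropped
+ * since the previous one. kdbus_cmd_recv() below does the same with
+ * atomic_xchg(&conn->lost_count, 0).
+ */
+#include <stdatomic.h>
+
+struct example_peer {
+	atomic_uint lost;
+};
+
+static void example_peer_lost_message(struct example_peer *p)
+{
+	/* the broadcast could not be queued; just remember that */
+	atomic_fetch_add(&p->lost, 1);
+}
+
+static unsigned int example_peer_recv_dropped(struct example_peer *p)
+{
+	/* report and reset in one atomic step */
+	return atomic_exchange(&p->lost, 0);
+}
+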
+/* Callers should take the conn_dst lock */
+static struct kdbus_queue_entry *
+kdbus_conn_entry_make(struct kdbus_conn *conn_src,
+ struct kdbus_conn *conn_dst,
+ struct kdbus_staging *staging)
+{
+ /* The remote connection was disconnected */
+ if (!kdbus_conn_active(conn_dst))
+ return ERR_PTR(-ECONNRESET);
+
+ /*
+ * If the connection does not accept file descriptors but the message
+ * has some attached, refuse it.
+ *
+ * If this is a monitor connection, accept the message. In that
+ * case, all file descriptors will be set to -1 at receive time.
+ */
+ if (!kdbus_conn_is_monitor(conn_dst) &&
+ !(conn_dst->flags & KDBUS_HELLO_ACCEPT_FD) &&
+ staging->gaps && staging->gaps->n_fds > 0)
+ return ERR_PTR(-ECOMM);
+
+ return kdbus_queue_entry_new(conn_src, conn_dst, staging);
+}
+
+/*
+ * Synchronously responding to a message, allocate a queue entry
+ * and attach it to the reply tracking object.
+ * The connection's queue will never get to see it.
+ */
+static int kdbus_conn_entry_sync_attach(struct kdbus_conn *conn_dst,
+ struct kdbus_staging *staging,
+ struct kdbus_reply *reply_wake)
+{
+ struct kdbus_queue_entry *entry;
+ int remote_ret, ret = 0;
+
+ mutex_lock(&reply_wake->reply_dst->lock);
+
+ /*
+ * If we are still waiting then proceed, allocate a queue
+ * entry and attach it to the reply object
+ */
+ if (reply_wake->waiting) {
+ entry = kdbus_conn_entry_make(reply_wake->reply_src, conn_dst,
+ staging);
+ if (IS_ERR(entry))
+ ret = PTR_ERR(entry);
+ else
+ /* Attach the entry to the reply object */
+ reply_wake->queue_entry = entry;
+ } else {
+ ret = -ECONNRESET;
+ }
+
+ /*
+ * Update the reply object and wake up remote peer only
+ * on appropriate return codes
+ *
+ * * -ECOMM: if the replying connection failed with -ECOMM
+ * then wakeup remote peer with -EREMOTEIO
+ *
+	 * We do this to differentiate between -ECOMM errors
+ * from the original sender perspective:
+ * -ECOMM error during the sync send and
+ * -ECOMM error during the sync reply, this last
+ * one is rewritten to -EREMOTEIO
+ *
+ * * Wake up on all other return codes.
+ */
+ remote_ret = ret;
+
+ if (ret == -ECOMM)
+ remote_ret = -EREMOTEIO;
+
+ kdbus_sync_reply_wakeup(reply_wake, remote_ret);
+ kdbus_reply_unlink(reply_wake);
+ mutex_unlock(&reply_wake->reply_dst->lock);
+
+ return ret;
+}
+
+/**
+ * kdbus_conn_entry_insert() - enqueue a message into the receiver's pool
+ * @conn_src: The sending connection
+ * @conn_dst: The connection to queue into
+ * @staging: Message to send
+ * @reply: The reply tracker to attach to the queue entry
+ * @name: Destination name this msg is sent to, or NULL
+ *
+ * Return: 0 on success. negative error otherwise.
+ */
+int kdbus_conn_entry_insert(struct kdbus_conn *conn_src,
+ struct kdbus_conn *conn_dst,
+ struct kdbus_staging *staging,
+ struct kdbus_reply *reply,
+ const struct kdbus_name_entry *name)
+{
+ struct kdbus_queue_entry *entry;
+ int ret;
+
+ kdbus_conn_lock2(conn_src, conn_dst);
+
+ entry = kdbus_conn_entry_make(conn_src, conn_dst, staging);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ goto exit_unlock;
+ }
+
+ if (reply) {
+ kdbus_reply_link(reply);
+ if (!reply->sync)
+ schedule_delayed_work(&conn_src->work, 0);
+ }
+
+ /*
+ * Record the sequence number of the registered name; it will
+ * be remembered by the queue, in case messages addressed to a
+ * name need to be moved from or to an activator.
+ */
+ if (name)
+ entry->dst_name_id = name->name_id;
+
+ kdbus_queue_entry_enqueue(entry, reply);
+ wake_up_interruptible(&conn_dst->wait);
+
+ ret = 0;
+
+exit_unlock:
+ kdbus_conn_unlock2(conn_src, conn_dst);
+ return ret;
+}
+
+static int kdbus_conn_wait_reply(struct kdbus_conn *conn_src,
+ struct kdbus_cmd_send *cmd_send,
+ struct file *ioctl_file,
+ struct file *cancel_fd,
+ struct kdbus_reply *reply_wait,
+ ktime_t expire)
+{
+ struct kdbus_queue_entry *entry;
+ struct poll_wqueues pwq = {};
+ int ret;
+
+ if (WARN_ON(!reply_wait))
+ return -EIO;
+
+ /*
+ * Block until the reply arrives. reply_wait is left untouched
+ * by the timeout scans that might be conducted for other,
+ * asynchronous replies of conn_src.
+ */
+
+ poll_initwait(&pwq);
+ poll_wait(ioctl_file, &conn_src->wait, &pwq.pt);
+
+ for (;;) {
+ /*
+ * Any of the following conditions will stop our synchronously
+ * blocking SEND command:
+ *
+ * a) The origin sender closed its connection
+ * b) The remote peer answered, setting reply_wait->waiting = 0
+ * c) The cancel FD was written to
+ * d) A signal was received
+ * e) The specified timeout was reached, and none of the above
+ * conditions kicked in.
+ */
+
+ /*
+ * We have already acquired an active reference when
+ * entering here, but another thread may call
+ * KDBUS_CMD_BYEBYE which does not acquire an active
+ * reference, therefore kdbus_conn_disconnect() will
+ * not wait for us.
+ */
+ if (!kdbus_conn_active(conn_src)) {
+ ret = -ECONNRESET;
+ break;
+ }
+
+ /*
+		 * After the replying peer has unset the waiting variable,
+		 * it will wake us up.
+ */
+ if (!reply_wait->waiting) {
+ ret = reply_wait->err;
+ break;
+ }
+
+ if (cancel_fd) {
+ unsigned int r;
+
+ r = cancel_fd->f_op->poll(cancel_fd, &pwq.pt);
+ if (r & POLLIN) {
+ ret = -ECANCELED;
+ break;
+ }
+ }
+
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (!poll_schedule_timeout(&pwq, TASK_INTERRUPTIBLE,
+ &expire, 0)) {
+ ret = -ETIMEDOUT;
+ break;
+ }
+
+ /*
+ * Reset the poll worker func, so the waitqueues are not
+ * added to the poll table again. We just reuse what we've
+ * collected earlier for further iterations.
+ */
+ init_poll_funcptr(&pwq.pt, NULL);
+ }
+
+ poll_freewait(&pwq);
+
+ if (ret == -EINTR) {
+ /*
+ * Interrupted system call. Unref the reply object, and pass
+ * the return value down the chain. Mark the reply as
+ * interrupted, so the cleanup work can remove it, but do not
+ * unlink it from the list. Once the syscall restarts, we'll
+ * pick it up and wait on it again.
+ */
+ mutex_lock(&conn_src->lock);
+ reply_wait->interrupted = true;
+ schedule_delayed_work(&conn_src->work, 0);
+ mutex_unlock(&conn_src->lock);
+
+ return -ERESTARTSYS;
+ }
+
+ mutex_lock(&conn_src->lock);
+ reply_wait->waiting = false;
+ entry = reply_wait->queue_entry;
+ if (entry) {
+ ret = kdbus_queue_entry_install(entry,
+ &cmd_send->reply.return_flags,
+ true);
+ kdbus_pool_slice_publish(entry->slice, &cmd_send->reply.offset,
+ &cmd_send->reply.msg_size);
+ kdbus_queue_entry_free(entry);
+ }
+ kdbus_reply_unlink(reply_wait);
+ mutex_unlock(&conn_src->lock);
+
+ return ret;
+}
+
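+/*
+ * Rough user-space analogue of the cancellable, timed synchronous wait
+ * implemented by kdbus_conn_wait_reply() above. The descriptors and
+ * names are assumptions made for the example: the reply is modelled by
+ * a descriptor that becomes readable, while kdbus instead re-checks
+ * reply_wait->waiting under the connection lock. The outcome set is the
+ * same: reply ready, cancelled, interrupted by a signal, or timed out.
+ */
+#include <errno.h>
+#include <poll.h>
+
+static int example_wait_reply(int reply_fd, int cancel_fd, int timeout_ms)
+{
+	struct pollfd pfd[2] = {
+		{ .fd = reply_fd,  .events = POLLIN },
+		{ .fd = cancel_fd, .events = POLLIN },
+	};
+	int n = poll(pfd, cancel_fd >= 0 ? 2 : 1, timeout_ms);
+
+	if (n < 0)
+		return errno == EINTR ? -EINTR : -errno;
+	if (n == 0)
+		return -ETIMEDOUT;	/* deadline reached */
+	if (cancel_fd >= 0 && (pfd[1].revents & POLLIN))
+		return -ECANCELED;	/* cancel FD was written to */
+
+	return 0;			/* reply is ready to be read */
+}
+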
+static int kdbus_pin_dst(struct kdbus_bus *bus,
+ struct kdbus_staging *staging,
+ struct kdbus_name_entry **out_name,
+ struct kdbus_conn **out_dst)
+{
+ const struct kdbus_msg *msg = staging->msg;
+ struct kdbus_name_entry *name = NULL;
+ struct kdbus_conn *dst = NULL;
+ int ret;
+
+ lockdep_assert_held(&bus->name_registry->rwlock);
+
+ if (!staging->dst_name) {
+ dst = kdbus_bus_find_conn_by_id(bus, msg->dst_id);
+ if (!dst)
+ return -ENXIO;
+
+ if (!kdbus_conn_is_ordinary(dst)) {
+ ret = -ENXIO;
+ goto error;
+ }
+ } else {
+ name = kdbus_name_lookup_unlocked(bus->name_registry,
+ staging->dst_name);
+ if (!name)
+ return -ESRCH;
+
+ /*
+ * If both a name and a connection ID are given as destination
+ * of a message, check that the currently owning connection of
+ * the name matches the specified ID.
+ * This way, we allow userspace to send the message to a
+ * specific connection by ID only if the connection currently
+ * owns the given name.
+ */
+ if (msg->dst_id != KDBUS_DST_ID_NAME &&
+ msg->dst_id != name->conn->id)
+ return -EREMCHG;
+
+ if (!name->conn && name->activator)
+ dst = kdbus_conn_ref(name->activator);
+ else
+ dst = kdbus_conn_ref(name->conn);
+
+ if ((msg->flags & KDBUS_MSG_NO_AUTO_START) &&
+ kdbus_conn_is_activator(dst)) {
+ ret = -EADDRNOTAVAIL;
+ goto error;
+ }
+ }
+
+ *out_name = name;
+ *out_dst = dst;
+ return 0;
+
+error:
+ kdbus_conn_unref(dst);
+ return ret;
+}
+
+static int kdbus_conn_reply(struct kdbus_conn *src,
+ struct kdbus_staging *staging)
+{
+ const struct kdbus_msg *msg = staging->msg;
+ struct kdbus_name_entry *name = NULL;
+ struct kdbus_reply *reply, *wake = NULL;
+ struct kdbus_conn *dst = NULL;
+ struct kdbus_bus *bus = src->ep->bus;
+ int ret;
+
+ if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) ||
+ WARN_ON(msg->flags & KDBUS_MSG_EXPECT_REPLY) ||
+ WARN_ON(msg->flags & KDBUS_MSG_SIGNAL))
+ return -EINVAL;
+
+ /* name-registry must be locked for lookup *and* collecting data */
+ down_read(&bus->name_registry->rwlock);
+
+ /* find and pin destination */
+
+ ret = kdbus_pin_dst(bus, staging, &name, &dst);
+ if (ret < 0)
+ goto exit;
+
+ mutex_lock(&dst->lock);
+ reply = kdbus_reply_find(src, dst, msg->cookie_reply);
+ if (reply) {
+ if (reply->sync)
+ wake = kdbus_reply_ref(reply);
+ kdbus_reply_unlink(reply);
+ }
+ mutex_unlock(&dst->lock);
+
+ if (!reply) {
+ ret = -EPERM;
+ goto exit;
+ }
+
+ /* send message */
+
+ kdbus_bus_eavesdrop(bus, src, staging);
+
+ if (wake)
+ ret = kdbus_conn_entry_sync_attach(dst, staging, wake);
+ else
+ ret = kdbus_conn_entry_insert(src, dst, staging, NULL, name);
+
+exit:
+ up_read(&bus->name_registry->rwlock);
+ kdbus_reply_unref(wake);
+ kdbus_conn_unref(dst);
+ return ret;
+}
+
+static struct kdbus_reply *kdbus_conn_call(struct kdbus_conn *src,
+ struct kdbus_staging *staging,
+ ktime_t exp)
+{
+ const struct kdbus_msg *msg = staging->msg;
+ struct kdbus_name_entry *name = NULL;
+ struct kdbus_reply *wait = NULL;
+ struct kdbus_conn *dst = NULL;
+ struct kdbus_bus *bus = src->ep->bus;
+ int ret;
+
+ if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) ||
+ WARN_ON(msg->flags & KDBUS_MSG_SIGNAL) ||
+ WARN_ON(!(msg->flags & KDBUS_MSG_EXPECT_REPLY)))
+ return ERR_PTR(-EINVAL);
+
+ /* resume previous wait-context, if available */
+
+ mutex_lock(&src->lock);
+ wait = kdbus_reply_find(NULL, src, msg->cookie);
+ if (wait) {
+ if (wait->interrupted) {
+ kdbus_reply_ref(wait);
+ wait->interrupted = false;
+ } else {
+ wait = NULL;
+ }
+ }
+ mutex_unlock(&src->lock);
+
+ if (wait)
+ return wait;
+
+ if (ktime_compare(ktime_get(), exp) >= 0)
+ return ERR_PTR(-ETIMEDOUT);
+
+ /* name-registry must be locked for lookup *and* collecting data */
+ down_read(&bus->name_registry->rwlock);
+
+ /* find and pin destination */
+
+ ret = kdbus_pin_dst(bus, staging, &name, &dst);
+ if (ret < 0)
+ goto exit;
+
+ if (!kdbus_conn_policy_talk(src, current_cred(), dst)) {
+ ret = -EPERM;
+ goto exit;
+ }
+
+ wait = kdbus_reply_new(dst, src, msg, name, true);
+ if (IS_ERR(wait)) {
+ ret = PTR_ERR(wait);
+ wait = NULL;
+ goto exit;
+ }
+
+ /* send message */
+
+ kdbus_bus_eavesdrop(bus, src, staging);
+
+ ret = kdbus_conn_entry_insert(src, dst, staging, wait, name);
+ if (ret < 0)
+ goto exit;
+
+ ret = 0;
+
+exit:
+ up_read(&bus->name_registry->rwlock);
+ if (ret < 0) {
+ kdbus_reply_unref(wait);
+ wait = ERR_PTR(ret);
+ }
+ kdbus_conn_unref(dst);
+ return wait;
+}
+
+static int kdbus_conn_unicast(struct kdbus_conn *src,
+ struct kdbus_staging *staging)
+{
+ const struct kdbus_msg *msg = staging->msg;
+ struct kdbus_name_entry *name = NULL;
+ struct kdbus_reply *wait = NULL;
+ struct kdbus_conn *dst = NULL;
+ struct kdbus_bus *bus = src->ep->bus;
+ bool is_signal = (msg->flags & KDBUS_MSG_SIGNAL);
+ int ret = 0;
+
+ if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) ||
+ WARN_ON(!(msg->flags & KDBUS_MSG_EXPECT_REPLY) &&
+ msg->cookie_reply != 0))
+ return -EINVAL;
+
+ /* name-registry must be locked for lookup *and* collecting data */
+ down_read(&bus->name_registry->rwlock);
+
+ /* find and pin destination */
+
+ ret = kdbus_pin_dst(bus, staging, &name, &dst);
+ if (ret < 0)
+ goto exit;
+
+ if (is_signal) {
+ /* like broadcasts we eavesdrop even if the msg is dropped */
+ kdbus_bus_eavesdrop(bus, src, staging);
+
+ /* drop silently if peer is not interested or not privileged */
+ if (!kdbus_match_db_match_msg(dst->match_db, src, staging) ||
+ !kdbus_conn_policy_talk(dst, NULL, src))
+ goto exit;
+ } else if (!kdbus_conn_policy_talk(src, current_cred(), dst)) {
+ ret = -EPERM;
+ goto exit;
+ } else if (msg->flags & KDBUS_MSG_EXPECT_REPLY) {
+ wait = kdbus_reply_new(dst, src, msg, name, false);
+ if (IS_ERR(wait)) {
+ ret = PTR_ERR(wait);
+ wait = NULL;
+ goto exit;
+ }
+ }
+
+ /* send message */
+
+ if (!is_signal)
+ kdbus_bus_eavesdrop(bus, src, staging);
+
+ ret = kdbus_conn_entry_insert(src, dst, staging, wait, name);
+ if (ret < 0 && !is_signal)
+ goto exit;
+
+ /* signals are treated like broadcasts, recv-errors are ignored */
+ ret = 0;
+
+exit:
+ up_read(&bus->name_registry->rwlock);
+ kdbus_reply_unref(wait);
+ kdbus_conn_unref(dst);
+ return ret;
+}
+
+/**
+ * kdbus_conn_move_messages() - move messages from one connection to another
+ * @conn_dst: Connection to copy to
+ * @conn_src: Connection to copy from
+ * @name_id: Filter for the sequence number of the registered
+ * name, 0 means no filtering.
+ *
+ * Move all messages from one connection to another. This is used when
+ * an implementer connection is taking over/giving back a well-known name
+ * from/to an activator connection.
+ */
+void kdbus_conn_move_messages(struct kdbus_conn *conn_dst,
+ struct kdbus_conn *conn_src,
+ u64 name_id)
+{
+ struct kdbus_queue_entry *e, *e_tmp;
+ struct kdbus_reply *r, *r_tmp;
+ struct kdbus_bus *bus;
+ struct kdbus_conn *c;
+ LIST_HEAD(msg_list);
+ int i, ret = 0;
+
+ if (WARN_ON(conn_src == conn_dst))
+ return;
+
+ bus = conn_src->ep->bus;
+
+ /* lock order: domain -> bus -> ep -> names -> conn */
+ down_read(&bus->conn_rwlock);
+ hash_for_each(bus->conn_hash, i, c, hentry) {
+ if (c == conn_src || c == conn_dst)
+ continue;
+
+ mutex_lock(&c->lock);
+ list_for_each_entry_safe(r, r_tmp, &c->reply_list, entry) {
+ if (r->reply_src != conn_src)
+ continue;
+
+ /* filter messages for a specific name */
+ if (name_id > 0 && r->name_id != name_id)
+ continue;
+
+ kdbus_conn_unref(r->reply_src);
+ r->reply_src = kdbus_conn_ref(conn_dst);
+ }
+ mutex_unlock(&c->lock);
+ }
+ up_read(&bus->conn_rwlock);
+
+ kdbus_conn_lock2(conn_src, conn_dst);
+ list_for_each_entry_safe(e, e_tmp, &conn_src->queue.msg_list, entry) {
+ /* filter messages for a specific name */
+ if (name_id > 0 && e->dst_name_id != name_id)
+ continue;
+
+ if (!(conn_dst->flags & KDBUS_HELLO_ACCEPT_FD) &&
+ e->gaps && e->gaps->n_fds > 0) {
+ kdbus_conn_lost_message(conn_dst);
+ kdbus_queue_entry_free(e);
+ continue;
+ }
+
+ ret = kdbus_queue_entry_move(e, conn_dst);
+ if (ret < 0) {
+ kdbus_conn_lost_message(conn_dst);
+ kdbus_queue_entry_free(e);
+ continue;
+ }
+ }
+ kdbus_conn_unlock2(conn_src, conn_dst);
+
+ /* wake up poll() */
+ wake_up_interruptible(&conn_dst->wait);
+}
+
+/* query the policy-database for all names of @whom */
+static bool kdbus_conn_policy_query_all(struct kdbus_conn *conn,
+ const struct cred *conn_creds,
+ struct kdbus_policy_db *db,
+ struct kdbus_conn *whom,
+ unsigned int access)
+{
+ struct kdbus_name_entry *ne;
+ bool pass = false;
+ int res;
+
+ lockdep_assert_held(&conn->ep->bus->name_registry->rwlock);
+
+ down_read(&db->entries_rwlock);
+ mutex_lock(&whom->lock);
+
+ list_for_each_entry(ne, &whom->names_list, conn_entry) {
+ res = kdbus_policy_query_unlocked(db, conn_creds ? : conn->cred,
+ ne->name,
+ kdbus_strhash(ne->name));
+ if (res >= (int)access) {
+ pass = true;
+ break;
+ }
+ }
+
+ mutex_unlock(&whom->lock);
+ up_read(&db->entries_rwlock);
+
+ return pass;
+}
+
+/**
+ * kdbus_conn_policy_own_name() - verify a connection can own the given name
+ * @conn: Connection
+ * @conn_creds: Credentials of @conn to use for policy check
+ * @name: Name
+ *
+ * This verifies that @conn is allowed to acquire the well-known name @name.
+ *
+ * Return: true if allowed, false if not.
+ */
+bool kdbus_conn_policy_own_name(struct kdbus_conn *conn,
+ const struct cred *conn_creds,
+ const char *name)
+{
+ unsigned int hash = kdbus_strhash(name);
+ int res;
+
+ if (!conn_creds)
+ conn_creds = conn->cred;
+
+ if (conn->ep->user) {
+ res = kdbus_policy_query(&conn->ep->policy_db, conn_creds,
+ name, hash);
+ if (res < KDBUS_POLICY_OWN)
+ return false;
+ }
+
+ if (conn->privileged)
+ return true;
+
+ res = kdbus_policy_query(&conn->ep->bus->policy_db, conn_creds,
+ name, hash);
+ return res >= KDBUS_POLICY_OWN;
+}
+
+/**
+ * kdbus_conn_policy_talk() - verify a connection can talk to a given peer
+ * @conn: Connection that tries to talk
+ * @conn_creds: Credentials of @conn to use for policy check
+ * @to: Connection that is talked to
+ *
+ * This verifies that @conn is allowed to talk to @to.
+ *
+ * Return: true if allowed, false if not.
+ */
+bool kdbus_conn_policy_talk(struct kdbus_conn *conn,
+ const struct cred *conn_creds,
+ struct kdbus_conn *to)
+{
+ if (!conn_creds)
+ conn_creds = conn->cred;
+
+ if (conn->ep->user &&
+ !kdbus_conn_policy_query_all(conn, conn_creds, &conn->ep->policy_db,
+ to, KDBUS_POLICY_TALK))
+ return false;
+
+ if (conn->privileged)
+ return true;
+ if (uid_eq(conn_creds->euid, to->cred->uid))
+ return true;
+
+ return kdbus_conn_policy_query_all(conn, conn_creds,
+ &conn->ep->bus->policy_db, to,
+ KDBUS_POLICY_TALK);
+}
+
+/**
+ * kdbus_conn_policy_see_name_unlocked() - verify a connection can see a given
+ * name
+ * @conn: Connection
+ * @conn_creds: Credentials of @conn to use for policy check
+ * @name: Name
+ *
+ * This verifies that @conn is allowed to see the well-known name @name. Caller
+ * must hold policy-lock.
+ *
+ * Return: true if allowed, false if not.
+ */
+bool kdbus_conn_policy_see_name_unlocked(struct kdbus_conn *conn,
+ const struct cred *conn_creds,
+ const char *name)
+{
+ int res;
+
+ /*
+ * By default, all names are visible on a bus. SEE policies can only be
+ * installed on custom endpoints, where by default no name is visible.
+ */
+ if (!conn->ep->user)
+ return true;
+
+ res = kdbus_policy_query_unlocked(&conn->ep->policy_db,
+ conn_creds ? : conn->cred,
+ name, kdbus_strhash(name));
+ return res >= KDBUS_POLICY_SEE;
+}
+
+static bool kdbus_conn_policy_see_name(struct kdbus_conn *conn,
+ const struct cred *conn_creds,
+ const char *name)
+{
+ bool res;
+
+ down_read(&conn->ep->policy_db.entries_rwlock);
+ res = kdbus_conn_policy_see_name_unlocked(conn, conn_creds, name);
+ up_read(&conn->ep->policy_db.entries_rwlock);
+
+ return res;
+}
+
+static bool kdbus_conn_policy_see(struct kdbus_conn *conn,
+ const struct cred *conn_creds,
+ struct kdbus_conn *whom)
+{
+ /*
+ * By default, all names are visible on a bus, so a connection can
+ * always see other connections. SEE policies can only be installed on
+ * custom endpoints, where by default no name is visible and we hide
+ * peers from each other, unless you see at least _one_ name of the
+ * peer.
+ */
+ return !conn->ep->user ||
+ kdbus_conn_policy_query_all(conn, conn_creds,
+ &conn->ep->policy_db, whom,
+ KDBUS_POLICY_SEE);
+}
+
+/**
+ * kdbus_conn_policy_see_notification() - verify a connection is allowed to
+ * receive a given kernel notification
+ * @conn: Connection
+ * @conn_creds: Credentials of @conn to use for policy check
+ * @msg: Notification message
+ *
+ * This checks whether @conn is allowed to see the kernel notification.
+ *
+ * Return: true if allowed, false if not.
+ */
+bool kdbus_conn_policy_see_notification(struct kdbus_conn *conn,
+ const struct cred *conn_creds,
+ const struct kdbus_msg *msg)
+{
+ /*
+ * Depending on the notification type, broadcasted kernel notifications
+ * have to be filtered:
+ *
+ * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE}: This notification is forwarded
+ * to a peer if, and only if, that peer can see the name this
+ * notification is for.
+ *
+ * KDBUS_ITEM_ID_{ADD,REMOVE}: Notifications for ID changes are
+ * broadcast to everyone, to allow tracking peers.
+ */
+
+ switch (msg->items[0].type) {
+ case KDBUS_ITEM_NAME_ADD:
+ case KDBUS_ITEM_NAME_REMOVE:
+ case KDBUS_ITEM_NAME_CHANGE:
+ return kdbus_conn_policy_see_name(conn, conn_creds,
+ msg->items[0].name_change.name);
+
+ case KDBUS_ITEM_ID_ADD:
+ case KDBUS_ITEM_ID_REMOVE:
+ return true;
+
+ default:
+ WARN(1, "Invalid type for notification broadcast: %llu\n",
+ (unsigned long long)msg->items[0].type);
+ return false;
+ }
+}
+
+/**
+ * kdbus_cmd_hello() - handle KDBUS_CMD_HELLO
+ * @ep: Endpoint to operate on
+ * @privileged: Whether the caller is privileged
+ * @argp: Command payload
+ *
+ * Return: NULL or newly created connection on success, ERR_PTR on failure.
+ */
+struct kdbus_conn *kdbus_cmd_hello(struct kdbus_ep *ep, bool privileged,
+ void __user *argp)
+{
+ struct kdbus_cmd_hello *cmd;
+ struct kdbus_conn *c = NULL;
+ const char *item_name;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_NAME },
+ { .type = KDBUS_ITEM_CREDS },
+ { .type = KDBUS_ITEM_PIDS },
+ { .type = KDBUS_ITEM_SECLABEL },
+ { .type = KDBUS_ITEM_CONN_DESCRIPTION },
+ { .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE |
+ KDBUS_HELLO_ACCEPT_FD |
+ KDBUS_HELLO_ACTIVATOR |
+ KDBUS_HELLO_POLICY_HOLDER |
+ KDBUS_HELLO_MONITOR,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+
+ item_name = argv[1].item ? argv[1].item->str : NULL;
+
+ c = kdbus_conn_new(ep, privileged, cmd, item_name,
+ argv[2].item ? &argv[2].item->creds : NULL,
+ argv[3].item ? &argv[3].item->pids : NULL,
+ argv[4].item ? argv[4].item->str : NULL,
+ argv[5].item ? argv[5].item->str : NULL);
+ if (IS_ERR(c)) {
+ ret = PTR_ERR(c);
+ c = NULL;
+ goto exit;
+ }
+
+ ret = kdbus_conn_connect(c, item_name);
+ if (ret < 0)
+ goto exit;
+
+ if (kdbus_conn_is_activator(c) || kdbus_conn_is_policy_holder(c)) {
+ ret = kdbus_conn_acquire(c);
+ if (ret < 0)
+ goto exit;
+
+ ret = kdbus_policy_set(&c->ep->bus->policy_db, args.items,
+ args.items_size, 1,
+ kdbus_conn_is_policy_holder(c), c);
+ kdbus_conn_release(c);
+ if (ret < 0)
+ goto exit;
+ }
+
+ if (copy_to_user(argp, cmd, sizeof(*cmd)))
+ ret = -EFAULT;
+
+exit:
+ ret = kdbus_args_clear(&args, ret);
+ if (ret < 0) {
+ if (c) {
+ kdbus_conn_disconnect(c, false);
+ kdbus_conn_unref(c);
+ }
+ return ERR_PTR(ret);
+ }
+ return c;
+}
+
+/**
+ * kdbus_cmd_byebye_unlocked() - handle KDBUS_CMD_BYEBYE
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * The caller must not hold any active reference to @conn or this will deadlock.
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_byebye_unlocked(struct kdbus_conn *conn, void __user *argp)
+{
+ struct kdbus_cmd *cmd;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ if (!kdbus_conn_is_ordinary(conn))
+ return -EOPNOTSUPP;
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ ret = kdbus_conn_disconnect(conn, true);
+ return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_conn_info() - handle KDBUS_CMD_CONN_INFO
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_conn_info(struct kdbus_conn *conn, void __user *argp)
+{
+ struct kdbus_meta_conn *conn_meta = NULL;
+ struct kdbus_pool_slice *slice = NULL;
+ struct kdbus_name_entry *entry = NULL;
+ struct kdbus_conn *owner_conn = NULL;
+ struct kdbus_item *meta_items = NULL;
+ struct kdbus_info info = {};
+ struct kdbus_cmd_info *cmd;
+ struct kdbus_bus *bus = conn->ep->bus;
+ struct kvec kvec[3];
+ size_t meta_size, cnt = 0;
+ const char *name;
+ u64 attach_flags, size = 0;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_NAME },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ /* registry must be held throughout lookup *and* collecting data */
+ down_read(&bus->name_registry->rwlock);
+
+ ret = kdbus_sanitize_attach_flags(cmd->attach_flags, &attach_flags);
+ if (ret < 0)
+ goto exit;
+
+ name = argv[1].item ? argv[1].item->str : NULL;
+
+ if (name) {
+ entry = kdbus_name_lookup_unlocked(bus->name_registry, name);
+ if (!entry || !entry->conn ||
+ !kdbus_conn_policy_see_name(conn, current_cred(), name) ||
+ (cmd->id != 0 && entry->conn->id != cmd->id)) {
+ /* pretend a name doesn't exist if you cannot see it */
+ ret = -ESRCH;
+ goto exit;
+ }
+
+ owner_conn = kdbus_conn_ref(entry->conn);
+ } else if (cmd->id > 0) {
+ owner_conn = kdbus_bus_find_conn_by_id(bus, cmd->id);
+ if (!owner_conn || !kdbus_conn_policy_see(conn, current_cred(),
+ owner_conn)) {
+ /* pretend an id doesn't exist if you cannot see it */
+ ret = -ENXIO;
+ goto exit;
+ }
+ } else {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ attach_flags &= atomic64_read(&owner_conn->attach_flags_send);
+
+ conn_meta = kdbus_meta_conn_new();
+ if (IS_ERR(conn_meta)) {
+ ret = PTR_ERR(conn_meta);
+ conn_meta = NULL;
+ goto exit;
+ }
+
+ ret = kdbus_meta_conn_collect(conn_meta, owner_conn, 0, attach_flags);
+ if (ret < 0)
+ goto exit;
+
+ ret = kdbus_meta_emit(owner_conn->meta_proc, owner_conn->meta_fake,
+ conn_meta, conn, attach_flags,
+ &meta_items, &meta_size);
+ if (ret < 0)
+ goto exit;
+
+ info.id = owner_conn->id;
+ info.flags = owner_conn->flags;
+
+ kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &size);
+ if (meta_size > 0) {
+ kdbus_kvec_set(&kvec[cnt++], meta_items, meta_size, &size);
+ cnt += !!kdbus_kvec_pad(&kvec[cnt], &size);
+ }
+
+ info.size = size;
+
+ slice = kdbus_pool_slice_alloc(conn->pool, size, false);
+ if (IS_ERR(slice)) {
+ ret = PTR_ERR(slice);
+ slice = NULL;
+ goto exit;
+ }
+
+ ret = kdbus_pool_slice_copy_kvec(slice, 0, kvec, cnt, size);
+ if (ret < 0)
+ goto exit;
+
+ kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->info_size);
+
+ if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) ||
+ kdbus_member_set_user(&cmd->info_size, argp,
+ typeof(*cmd), info_size)) {
+ ret = -EFAULT;
+ goto exit;
+ }
+
+ ret = 0;
+
+exit:
+ up_read(&bus->name_registry->rwlock);
+ kdbus_pool_slice_release(slice);
+ kfree(meta_items);
+ kdbus_meta_conn_unref(conn_meta);
+ kdbus_conn_unref(owner_conn);
+ return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_update() - handle KDBUS_CMD_UPDATE
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_update(struct kdbus_conn *conn, void __user *argp)
+{
+ struct kdbus_item *item_policy;
+ u64 *item_attach_send = NULL;
+ u64 *item_attach_recv = NULL;
+ struct kdbus_cmd *cmd;
+ u64 attach_send;
+ u64 attach_recv;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_ATTACH_FLAGS_SEND },
+ { .type = KDBUS_ITEM_ATTACH_FLAGS_RECV },
+ { .type = KDBUS_ITEM_NAME, .multiple = true },
+ { .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ item_attach_send = argv[1].item ? &argv[1].item->data64[0] : NULL;
+ item_attach_recv = argv[2].item ? &argv[2].item->data64[0] : NULL;
+ item_policy = argv[3].item ? : argv[4].item;
+
+ if (item_attach_send) {
+ if (!kdbus_conn_is_ordinary(conn) &&
+ !kdbus_conn_is_monitor(conn)) {
+ ret = -EOPNOTSUPP;
+ goto exit;
+ }
+
+ ret = kdbus_sanitize_attach_flags(*item_attach_send,
+ &attach_send);
+ if (ret < 0)
+ goto exit;
+ }
+
+ if (item_attach_recv) {
+ if (!kdbus_conn_is_ordinary(conn) &&
+ !kdbus_conn_is_monitor(conn) &&
+ !kdbus_conn_is_activator(conn)) {
+ ret = -EOPNOTSUPP;
+ goto exit;
+ }
+
+ ret = kdbus_sanitize_attach_flags(*item_attach_recv,
+ &attach_recv);
+ if (ret < 0)
+ goto exit;
+ }
+
+ if (item_policy && !kdbus_conn_is_policy_holder(conn)) {
+ ret = -EOPNOTSUPP;
+ goto exit;
+ }
+
+ /* now that we verified the input, update the connection */
+
+ if (item_policy) {
+ ret = kdbus_policy_set(&conn->ep->bus->policy_db, cmd->items,
+ KDBUS_ITEMS_SIZE(cmd, items),
+ 1, true, conn);
+ if (ret < 0)
+ goto exit;
+ }
+
+ if (item_attach_send)
+ atomic64_set(&conn->attach_flags_send, attach_send);
+
+ if (item_attach_recv)
+ atomic64_set(&conn->attach_flags_recv, attach_recv);
+
+exit:
+ return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_send() - handle KDBUS_CMD_SEND
+ * @conn: connection to operate on
+ * @f: file this command was called on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_send(struct kdbus_conn *conn, struct file *f, void __user *argp)
+{
+ struct kdbus_cmd_send *cmd;
+ struct kdbus_staging *staging = NULL;
+ struct kdbus_msg *msg = NULL;
+ struct file *cancel_fd = NULL;
+ int ret, ret2;
+
+ /* command arguments */
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_CANCEL_FD },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE |
+ KDBUS_SEND_SYNC_REPLY,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ /* message arguments */
+ struct kdbus_arg msg_argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_PAYLOAD_VEC, .multiple = true },
+ { .type = KDBUS_ITEM_PAYLOAD_MEMFD, .multiple = true },
+ { .type = KDBUS_ITEM_FDS },
+ { .type = KDBUS_ITEM_BLOOM_FILTER },
+ { .type = KDBUS_ITEM_DST_NAME },
+ };
+ struct kdbus_args msg_args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE |
+ KDBUS_MSG_EXPECT_REPLY |
+ KDBUS_MSG_NO_AUTO_START |
+ KDBUS_MSG_SIGNAL,
+ .argv = msg_argv,
+ .argc = ARRAY_SIZE(msg_argv),
+ };
+
+ if (!kdbus_conn_is_ordinary(conn))
+ return -EOPNOTSUPP;
+
+ /* make sure to parse both, @cmd and @msg on negotiation */
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret < 0)
+ goto exit;
+ else if (ret > 0 && !cmd->msg_address) /* negotiation without msg */
+ goto exit;
+
+ ret2 = kdbus_args_parse_msg(&msg_args, KDBUS_PTR(cmd->msg_address),
+ &msg);
+ if (ret2 < 0) { /* cannot parse message */
+ ret = ret2;
+ goto exit;
+ } else if (ret2 > 0 && !ret) { /* msg-negot implies cmd-negot */
+ ret = -EINVAL;
+ goto exit;
+ } else if (ret > 0) { /* negotiation */
+ goto exit;
+ }
+
+ /* here we parsed both, @cmd and @msg, and neither wants negotiation */
+
+ cmd->reply.return_flags = 0;
+ kdbus_pool_publish_empty(conn->pool, &cmd->reply.offset,
+ &cmd->reply.msg_size);
+
+ if (argv[1].item) {
+ cancel_fd = fget(argv[1].item->fds[0]);
+ if (!cancel_fd) {
+ ret = -EBADF;
+ goto exit;
+ }
+
+ if (!cancel_fd->f_op->poll) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ }
+
+ /* patch-in the source of this message */
+ if (msg->src_id > 0 && msg->src_id != conn->id) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ msg->src_id = conn->id;
+
+ staging = kdbus_staging_new_user(conn->ep->bus, cmd, msg);
+ if (IS_ERR(staging)) {
+ ret = PTR_ERR(staging);
+ staging = NULL;
+ goto exit;
+ }
+
+ if (msg->dst_id == KDBUS_DST_ID_BROADCAST) {
+ down_read(&conn->ep->bus->name_registry->rwlock);
+ kdbus_bus_broadcast(conn->ep->bus, conn, staging);
+ up_read(&conn->ep->bus->name_registry->rwlock);
+ } else if (cmd->flags & KDBUS_SEND_SYNC_REPLY) {
+ struct kdbus_reply *r;
+ ktime_t exp;
+
+ exp = ns_to_ktime(msg->timeout_ns);
+ r = kdbus_conn_call(conn, staging, exp);
+ if (IS_ERR(r)) {
+ ret = PTR_ERR(r);
+ goto exit;
+ }
+
+ ret = kdbus_conn_wait_reply(conn, cmd, f, cancel_fd, r, exp);
+ kdbus_reply_unref(r);
+ if (ret < 0)
+ goto exit;
+ } else if ((msg->flags & KDBUS_MSG_EXPECT_REPLY) ||
+ msg->cookie_reply == 0) {
+ ret = kdbus_conn_unicast(conn, staging);
+ if (ret < 0)
+ goto exit;
+ } else {
+ ret = kdbus_conn_reply(conn, staging);
+ if (ret < 0)
+ goto exit;
+ }
+
+ if (kdbus_member_set_user(&cmd->reply, argp, typeof(*cmd), reply))
+ ret = -EFAULT;
+
+exit:
+ if (cancel_fd)
+ fput(cancel_fd);
+ kdbus_staging_free(staging);
+ ret = kdbus_args_clear(&msg_args, ret);
+ return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_recv() - handle KDBUS_CMD_RECV
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_recv(struct kdbus_conn *conn, void __user *argp)
+{
+ struct kdbus_queue_entry *entry;
+ struct kdbus_cmd_recv *cmd;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE |
+ KDBUS_RECV_PEEK |
+ KDBUS_RECV_DROP |
+ KDBUS_RECV_USE_PRIORITY,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ if (!kdbus_conn_is_ordinary(conn) &&
+ !kdbus_conn_is_monitor(conn) &&
+ !kdbus_conn_is_activator(conn))
+ return -EOPNOTSUPP;
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ cmd->dropped_msgs = 0;
+ cmd->msg.return_flags = 0;
+ kdbus_pool_publish_empty(conn->pool, &cmd->msg.offset,
+ &cmd->msg.msg_size);
+
+	/* DROP+priority is not reliable, so prevent it */
+ if ((cmd->flags & KDBUS_RECV_DROP) &&
+ (cmd->flags & KDBUS_RECV_USE_PRIORITY)) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ mutex_lock(&conn->lock);
+
+ entry = kdbus_queue_peek(&conn->queue, cmd->priority,
+ cmd->flags & KDBUS_RECV_USE_PRIORITY);
+ if (!entry) {
+ mutex_unlock(&conn->lock);
+ ret = -EAGAIN;
+ } else if (cmd->flags & KDBUS_RECV_DROP) {
+ struct kdbus_reply *reply = kdbus_reply_ref(entry->reply);
+
+ kdbus_queue_entry_free(entry);
+
+ mutex_unlock(&conn->lock);
+
+ if (reply) {
+ mutex_lock(&reply->reply_dst->lock);
+ if (!list_empty(&reply->entry)) {
+ kdbus_reply_unlink(reply);
+ if (reply->sync)
+ kdbus_sync_reply_wakeup(reply, -EPIPE);
+ else
+ kdbus_notify_reply_dead(conn->ep->bus,
+ reply->reply_dst->id,
+ reply->cookie);
+ }
+ mutex_unlock(&reply->reply_dst->lock);
+ kdbus_notify_flush(conn->ep->bus);
+ }
+
+ kdbus_reply_unref(reply);
+ } else {
+ bool install_fds;
+
+ /*
+ * PEEK just returns the location of the next message. Do not
+ * install FDs nor memfds nor anything else. The only
+ * information of interest should be the message header and
+		 * metadata. Any FD numbers in the payload are undefined for
+ * PEEK'ed messages.
+ * Also make sure to never install fds into a connection that
+ * has refused to receive any. Ordinary connections will not get
+		 * messages with FDs queued (the sender will get -ECOMM), but
+ * eavesdroppers might.
+ */
+ install_fds = (conn->flags & KDBUS_HELLO_ACCEPT_FD) &&
+ !(cmd->flags & KDBUS_RECV_PEEK);
+
+ ret = kdbus_queue_entry_install(entry,
+ &cmd->msg.return_flags,
+ install_fds);
+ if (ret < 0) {
+ mutex_unlock(&conn->lock);
+ goto exit;
+ }
+
+ kdbus_pool_slice_publish(entry->slice, &cmd->msg.offset,
+ &cmd->msg.msg_size);
+
+ if (!(cmd->flags & KDBUS_RECV_PEEK))
+ kdbus_queue_entry_free(entry);
+
+ mutex_unlock(&conn->lock);
+ }
+
+ cmd->dropped_msgs = atomic_xchg(&conn->lost_count, 0);
+ if (cmd->dropped_msgs > 0)
+ cmd->return_flags |= KDBUS_RECV_RETURN_DROPPED_MSGS;
+
+ if (kdbus_member_set_user(&cmd->msg, argp, typeof(*cmd), msg) ||
+ kdbus_member_set_user(&cmd->dropped_msgs, argp, typeof(*cmd),
+ dropped_msgs))
+ ret = -EFAULT;
+
+exit:
+ return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_free() - handle KDBUS_CMD_FREE
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_free(struct kdbus_conn *conn, void __user *argp)
+{
+ struct kdbus_cmd_free *cmd;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ if (!kdbus_conn_is_ordinary(conn) &&
+ !kdbus_conn_is_monitor(conn) &&
+ !kdbus_conn_is_activator(conn))
+ return -EOPNOTSUPP;
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ ret = kdbus_pool_release_offset(conn->pool, cmd->offset);
+
+ return kdbus_args_clear(&args, ret);
+}
diff --git a/ipc/kdbus/connection.h b/ipc/kdbus/connection.h
new file mode 100644
index 000000000..5ee864eb0
--- /dev/null
+++ b/ipc/kdbus/connection.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_CONNECTION_H
+#define __KDBUS_CONNECTION_H
+
+#include <linux/atomic.h>
+#include <linux/kref.h>
+#include <linux/lockdep.h>
+#include <linux/path.h>
+
+#include "limits.h"
+#include "metadata.h"
+#include "pool.h"
+#include "queue.h"
+#include "util.h"
+
+#define KDBUS_HELLO_SPECIAL_CONN (KDBUS_HELLO_ACTIVATOR | \
+ KDBUS_HELLO_POLICY_HOLDER | \
+ KDBUS_HELLO_MONITOR)
+
+struct kdbus_quota;
+struct kdbus_staging;
+
+/**
+ * struct kdbus_conn - connection to a bus
+ * @kref: Reference count
+ * @active: Active references to the connection
+ * @id: Connection ID
+ * @flags: KDBUS_HELLO_* flags
+ * @attach_flags_send: KDBUS_ATTACH_* flags for sending
+ * @attach_flags_recv: KDBUS_ATTACH_* flags for receiving
+ * @description: Human-readable connection description, used for
+ * debugging. This field is only set when the
+ * connection is created.
+ * @ep: The endpoint this connection belongs to
+ * @lock: Connection data lock
+ * @hentry: Entry in ID <-> connection map
+ * @ep_entry: Entry in endpoint
+ * @monitor_entry: Entry in monitor, if the connection is a monitor
+ * @reply_list: List of connections this connection should
+ * reply to
+ * @work:			Delayed work to handle timeouts
+ * @match_db: Subscription filter to broadcast messages
+ * @meta_proc: Process metadata of connection creator, or NULL
+ * @meta_fake: Faked metadata, or NULL
+ * @pool: The user's buffer to receive messages
+ * @user: Owner of the connection
+ * @cred: The credentials of the connection at creation time
+ * @pid: Pid at creation time
+ * @root_path: Root path at creation time
+ * @name_count: Number of owned well-known names
+ * @request_count: Number of pending requests issued by this
+ * connection that are waiting for replies from
+ * other peers
+ * @lost_count: Number of lost broadcast messages
+ * @wait: Wake up this endpoint
+ * @queue: The message queue associated with this connection
+ * @quota: Array of per-user quota indexed by user->id
+ * @n_quota: Number of elements in quota array
+ * @activator_of:		Well-known name entry this connection acts as an
+ *				activator for
+ * @names_list: List of well-known names
+ * @names_queue_list: Well-known names this connection waits for
+ * @privileged: Whether this connection is privileged on the bus
+ */
+struct kdbus_conn {
+ struct kref kref;
+ atomic_t active;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+ u64 id;
+ u64 flags;
+ atomic64_t attach_flags_send;
+ atomic64_t attach_flags_recv;
+ const char *description;
+ struct kdbus_ep *ep;
+ struct mutex lock;
+ struct hlist_node hentry;
+ struct list_head ep_entry;
+ struct list_head monitor_entry;
+ struct list_head reply_list;
+ struct delayed_work work;
+ struct kdbus_match_db *match_db;
+ struct kdbus_meta_proc *meta_proc;
+ struct kdbus_meta_fake *meta_fake;
+ struct kdbus_pool *pool;
+ struct kdbus_user *user;
+ const struct cred *cred;
+ struct pid *pid;
+ struct path root_path;
+ atomic_t name_count;
+ atomic_t request_count;
+ atomic_t lost_count;
+ wait_queue_head_t wait;
+ struct kdbus_queue queue;
+
+ struct kdbus_quota *quota;
+ unsigned int n_quota;
+
+ /* protected by registry->rwlock */
+ struct kdbus_name_entry *activator_of;
+ struct list_head names_list;
+ struct list_head names_queue_list;
+
+ bool privileged:1;
+};
+
+struct kdbus_conn *kdbus_conn_ref(struct kdbus_conn *conn);
+struct kdbus_conn *kdbus_conn_unref(struct kdbus_conn *conn);
+bool kdbus_conn_active(const struct kdbus_conn *conn);
+int kdbus_conn_acquire(struct kdbus_conn *conn);
+void kdbus_conn_release(struct kdbus_conn *conn);
+int kdbus_conn_disconnect(struct kdbus_conn *conn, bool ensure_queue_empty);
+bool kdbus_conn_has_name(struct kdbus_conn *conn, const char *name);
+int kdbus_conn_quota_inc(struct kdbus_conn *c, struct kdbus_user *u,
+ size_t memory, size_t fds);
+void kdbus_conn_quota_dec(struct kdbus_conn *c, struct kdbus_user *u,
+ size_t memory, size_t fds);
+void kdbus_conn_lost_message(struct kdbus_conn *c);
+int kdbus_conn_entry_insert(struct kdbus_conn *conn_src,
+ struct kdbus_conn *conn_dst,
+ struct kdbus_staging *staging,
+ struct kdbus_reply *reply,
+ const struct kdbus_name_entry *name);
+void kdbus_conn_move_messages(struct kdbus_conn *conn_dst,
+ struct kdbus_conn *conn_src,
+ u64 name_id);
+
+/* policy */
+bool kdbus_conn_policy_own_name(struct kdbus_conn *conn,
+ const struct cred *conn_creds,
+ const char *name);
+bool kdbus_conn_policy_talk(struct kdbus_conn *conn,
+ const struct cred *conn_creds,
+ struct kdbus_conn *to);
+bool kdbus_conn_policy_see_name_unlocked(struct kdbus_conn *conn,
+ const struct cred *curr_creds,
+ const char *name);
+bool kdbus_conn_policy_see_notification(struct kdbus_conn *conn,
+ const struct cred *curr_creds,
+ const struct kdbus_msg *msg);
+
+/* command dispatcher */
+struct kdbus_conn *kdbus_cmd_hello(struct kdbus_ep *ep, bool privileged,
+ void __user *argp);
+int kdbus_cmd_byebye_unlocked(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_conn_info(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_update(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_send(struct kdbus_conn *conn, struct file *f, void __user *argp);
+int kdbus_cmd_recv(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_free(struct kdbus_conn *conn, void __user *argp);
+
+/**
+ * kdbus_conn_is_ordinary() - Check if connection is ordinary
+ * @conn: The connection to check
+ *
+ * Return: Non-zero if the connection is an ordinary connection
+ */
+static inline int kdbus_conn_is_ordinary(const struct kdbus_conn *conn)
+{
+ return !(conn->flags & KDBUS_HELLO_SPECIAL_CONN);
+}
+
+/**
+ * kdbus_conn_is_activator() - Check if connection is an activator
+ * @conn: The connection to check
+ *
+ * Return: Non-zero if the connection is an activator
+ */
+static inline int kdbus_conn_is_activator(const struct kdbus_conn *conn)
+{
+ return conn->flags & KDBUS_HELLO_ACTIVATOR;
+}
+
+/**
+ * kdbus_conn_is_policy_holder() - Check if connection is a policy holder
+ * @conn: The connection to check
+ *
+ * Return: Non-zero if the connection is a policy holder
+ */
+static inline int kdbus_conn_is_policy_holder(const struct kdbus_conn *conn)
+{
+ return conn->flags & KDBUS_HELLO_POLICY_HOLDER;
+}
+
+/**
+ * kdbus_conn_is_monitor() - Check if connection is a monitor
+ * @conn: The connection to check
+ *
+ * Return: Non-zero if the connection is a monitor
+ */
+static inline int kdbus_conn_is_monitor(const struct kdbus_conn *conn)
+{
+ return conn->flags & KDBUS_HELLO_MONITOR;
+}
+
+/**
+ * kdbus_conn_lock2() - Lock two connections
+ * @a: connection A to lock or NULL
+ * @b: connection B to lock or NULL
+ *
+ * Lock two connections at once. As we need to have a stable locking order, we
+ * always lock the connection with lower memory address first.
+ */
+static inline void kdbus_conn_lock2(struct kdbus_conn *a, struct kdbus_conn *b)
+{
+ if (a < b) {
+ if (a)
+ mutex_lock(&a->lock);
+ if (b && b != a)
+ mutex_lock_nested(&b->lock, !!a);
+ } else {
+ if (b)
+ mutex_lock(&b->lock);
+ if (a && a != b)
+ mutex_lock_nested(&a->lock, !!b);
+ }
+}
+
+/**
+ * kdbus_conn_unlock2() - Unlock two connections
+ * @a: connection A to unlock or NULL
+ * @b: connection B to unlock or NULL
+ *
+ * Unlock two connections at once. See kdbus_conn_lock2().
+ */
+static inline void kdbus_conn_unlock2(struct kdbus_conn *a,
+ struct kdbus_conn *b)
+{
+ if (a)
+ mutex_unlock(&a->lock);
+ if (b && b != a)
+ mutex_unlock(&b->lock);
+}
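+
+/*
+ * Illustrative caller pattern for the two helpers above (variable names are
+ * hypothetical, not taken from this file):
+ *
+ *   kdbus_conn_lock2(conn_src, conn_dst);
+ *   ... update both connections while holding their locks ...
+ *   kdbus_conn_unlock2(conn_src, conn_dst);
+ *
+ * Either pointer may be NULL, and passing the same connection twice locks it
+ * only once; the helpers handle the lock ordering internally.
+ */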
+
+/**
+ * kdbus_conn_assert_active() - lockdep assert on active lock
+ * @conn: connection that shall be active
+ *
+ * This verifies via lockdep that the caller holds an active reference to the
+ * given connection.
+ */
+static inline void kdbus_conn_assert_active(struct kdbus_conn *conn)
+{
+ lockdep_assert_held(conn);
+}
+
+#endif
diff --git a/ipc/kdbus/domain.c b/ipc/kdbus/domain.c
new file mode 100644
index 000000000..ac9f760c1
--- /dev/null
+++ b/ipc/kdbus/domain.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "bus.h"
+#include "domain.h"
+#include "handle.h"
+#include "item.h"
+#include "limits.h"
+#include "util.h"
+
+static void kdbus_domain_control_free(struct kdbus_node *node)
+{
+ kfree(node);
+}
+
+static struct kdbus_node *kdbus_domain_control_new(struct kdbus_domain *domain,
+ unsigned int access)
+{
+ struct kdbus_node *node;
+ int ret;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+
+ kdbus_node_init(node, KDBUS_NODE_CONTROL);
+
+ node->free_cb = kdbus_domain_control_free;
+ node->mode = S_IRUSR | S_IWUSR;
+ if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD))
+ node->mode |= S_IRGRP | S_IWGRP;
+ if (access & KDBUS_MAKE_ACCESS_WORLD)
+ node->mode |= S_IROTH | S_IWOTH;
+
+ ret = kdbus_node_link(node, &domain->node, "control");
+ if (ret < 0)
+ goto exit_free;
+
+ return node;
+
+exit_free:
+ kdbus_node_deactivate(node);
+ kdbus_node_unref(node);
+ return ERR_PTR(ret);
+}
+
+static void kdbus_domain_free(struct kdbus_node *node)
+{
+ struct kdbus_domain *domain =
+ container_of(node, struct kdbus_domain, node);
+
+ put_user_ns(domain->user_namespace);
+ ida_destroy(&domain->user_ida);
+ idr_destroy(&domain->user_idr);
+ kfree(domain);
+}
+
+/**
+ * kdbus_domain_new() - create a new domain
+ * @access: The access mode for this node (KDBUS_MAKE_ACCESS_*)
+ *
+ * Return: a new kdbus_domain on success, ERR_PTR on failure
+ */
+struct kdbus_domain *kdbus_domain_new(unsigned int access)
+{
+ struct kdbus_domain *d;
+ int ret;
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return ERR_PTR(-ENOMEM);
+
+ kdbus_node_init(&d->node, KDBUS_NODE_DOMAIN);
+
+ d->node.free_cb = kdbus_domain_free;
+ d->node.mode = S_IRUSR | S_IXUSR;
+ if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD))
+ d->node.mode |= S_IRGRP | S_IXGRP;
+ if (access & KDBUS_MAKE_ACCESS_WORLD)
+ d->node.mode |= S_IROTH | S_IXOTH;
+
+ mutex_init(&d->lock);
+ idr_init(&d->user_idr);
+ ida_init(&d->user_ida);
+
+ /* Pin user namespace so we can guarantee domain-unique bus names. */
+ d->user_namespace = get_user_ns(current_user_ns());
+
+ ret = kdbus_node_link(&d->node, NULL, NULL);
+ if (ret < 0)
+ goto exit_unref;
+
+ return d;
+
+exit_unref:
+ kdbus_node_deactivate(&d->node);
+ kdbus_node_unref(&d->node);
+ return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_domain_ref() - take a domain reference
+ * @domain: Domain
+ *
+ * Return: the domain itself
+ */
+struct kdbus_domain *kdbus_domain_ref(struct kdbus_domain *domain)
+{
+ if (domain)
+ kdbus_node_ref(&domain->node);
+ return domain;
+}
+
+/**
+ * kdbus_domain_unref() - drop a domain reference
+ * @domain: Domain
+ *
+ * When the last reference is dropped, the domain internal structure
+ * is freed.
+ *
+ * Return: NULL
+ */
+struct kdbus_domain *kdbus_domain_unref(struct kdbus_domain *domain)
+{
+ if (domain)
+ kdbus_node_unref(&domain->node);
+ return NULL;
+}
+
+/**
+ * kdbus_domain_populate() - populate static domain nodes
+ * @domain: domain to populate
+ * @access: KDBUS_MAKE_ACCESS_* access restrictions for new nodes
+ *
+ * Allocate and activate static sub-nodes of the given domain. This will fail if
+ * you call it on a non-active node or if the domain was already populated.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_domain_populate(struct kdbus_domain *domain, unsigned int access)
+{
+ struct kdbus_node *control;
+
+ /*
+ * Create a control-node for this domain. We drop our own reference
+ * immediately, effectively causing the node to be deactivated and
+ * released when the parent domain is.
+ */
+ control = kdbus_domain_control_new(domain, access);
+ if (IS_ERR(control))
+ return PTR_ERR(control);
+
+ kdbus_node_activate(control);
+ kdbus_node_unref(control);
+ return 0;
+}
+
+/**
+ * kdbus_user_lookup() - lookup a kdbus_user object
+ * @domain: domain of the user
+ * @uid: uid of the user; INVALID_UID for an anon user
+ *
+ * Lookup the kdbus user accounting object for the given domain. If INVALID_UID
+ * is passed, a new anonymous user is created which is private to the caller.
+ *
+ * Return: The user object is returned, ERR_PTR on failure.
+ */
+struct kdbus_user *kdbus_user_lookup(struct kdbus_domain *domain, kuid_t uid)
+{
+ struct kdbus_user *u = NULL, *old = NULL;
+ int ret;
+
+ mutex_lock(&domain->lock);
+
+ if (uid_valid(uid)) {
+ old = idr_find(&domain->user_idr, __kuid_val(uid));
+ /*
+ * If the object is about to be destroyed, ignore it and
+ * replace the slot in the IDR later on.
+ */
+ if (old && kref_get_unless_zero(&old->kref)) {
+ mutex_unlock(&domain->lock);
+ return old;
+ }
+ }
+
+ u = kzalloc(sizeof(*u), GFP_KERNEL);
+ if (!u) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ kref_init(&u->kref);
+ u->domain = kdbus_domain_ref(domain);
+ u->uid = uid;
+ atomic_set(&u->buses, 0);
+ atomic_set(&u->connections, 0);
+
+ if (uid_valid(uid)) {
+ if (old) {
+ idr_replace(&domain->user_idr, u, __kuid_val(uid));
+ old->uid = INVALID_UID; /* mark old as removed */
+ } else {
+ ret = idr_alloc(&domain->user_idr, u, __kuid_val(uid),
+ __kuid_val(uid) + 1, GFP_KERNEL);
+ if (ret < 0)
+ goto exit;
+ }
+ }
+
+ /*
+ * Allocate the smallest possible index for this user; used
+ * in arrays for accounting user quota in receiver queues.
+ */
+ ret = ida_simple_get(&domain->user_ida, 1, 0, GFP_KERNEL);
+ if (ret < 0)
+ goto exit;
+
+ u->id = ret;
+ mutex_unlock(&domain->lock);
+ return u;
+
+exit:
+ if (u) {
+ if (uid_valid(u->uid))
+ idr_remove(&domain->user_idr, __kuid_val(u->uid));
+ kdbus_domain_unref(u->domain);
+ kfree(u);
+ }
+ mutex_unlock(&domain->lock);
+ return ERR_PTR(ret);
+}
+
+static void __kdbus_user_free(struct kref *kref)
+{
+ struct kdbus_user *user = container_of(kref, struct kdbus_user, kref);
+
+ WARN_ON(atomic_read(&user->buses) > 0);
+ WARN_ON(atomic_read(&user->connections) > 0);
+
+ mutex_lock(&user->domain->lock);
+ ida_simple_remove(&user->domain->user_ida, user->id);
+ if (uid_valid(user->uid))
+ idr_remove(&user->domain->user_idr, __kuid_val(user->uid));
+ mutex_unlock(&user->domain->lock);
+
+ kdbus_domain_unref(user->domain);
+ kfree(user);
+}
+
+/**
+ * kdbus_user_ref() - take a user reference
+ * @u: User
+ *
+ * Return: @u is returned
+ */
+struct kdbus_user *kdbus_user_ref(struct kdbus_user *u)
+{
+ if (u)
+ kref_get(&u->kref);
+ return u;
+}
+
+/**
+ * kdbus_user_unref() - drop a user reference
+ * @u: User
+ *
+ * Return: NULL
+ */
+struct kdbus_user *kdbus_user_unref(struct kdbus_user *u)
+{
+ if (u)
+ kref_put(&u->kref, __kdbus_user_free);
+ return NULL;
+}
diff --git a/ipc/kdbus/domain.h b/ipc/kdbus/domain.h
new file mode 100644
index 000000000..447a2bd4d
--- /dev/null
+++ b/ipc/kdbus/domain.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_DOMAIN_H
+#define __KDBUS_DOMAIN_H
+
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/user_namespace.h>
+
+#include "node.h"
+
+/**
+ * struct kdbus_domain - domain for buses
+ * @node: Underlying API node
+ * @lock: Domain data lock
+ * @last_id: Last used object id
+ * @user_idr: Set of all users indexed by UID
+ * @user_ida: Set of all users to compute small indices
+ * @user_namespace: User namespace, pinned at creation time
+ * @dentry: Root dentry of VFS mount (don't use outside of kdbusfs)
+ */
+struct kdbus_domain {
+ struct kdbus_node node;
+ struct mutex lock;
+ atomic64_t last_id;
+ struct idr user_idr;
+ struct ida user_ida;
+ struct user_namespace *user_namespace;
+ struct dentry *dentry;
+};
+
+/**
+ * struct kdbus_user - resource accounting for users
+ * @kref: Reference counter
+ * @domain: Domain of the user
+ * @id: Index of this user
+ * @uid: UID of the user
+ * @buses: Number of buses the user has created
+ * @connections: Number of connections the user has created
+ */
+struct kdbus_user {
+ struct kref kref;
+ struct kdbus_domain *domain;
+ unsigned int id;
+ kuid_t uid;
+ atomic_t buses;
+ atomic_t connections;
+};
+
+#define kdbus_domain_from_node(_node) \
+ container_of((_node), struct kdbus_domain, node)
+
+struct kdbus_domain *kdbus_domain_new(unsigned int access);
+struct kdbus_domain *kdbus_domain_ref(struct kdbus_domain *domain);
+struct kdbus_domain *kdbus_domain_unref(struct kdbus_domain *domain);
+int kdbus_domain_populate(struct kdbus_domain *domain, unsigned int access);
+
+#define KDBUS_USER_KERNEL_ID 0 /* ID 0 is reserved for kernel accounting */
+
+struct kdbus_user *kdbus_user_lookup(struct kdbus_domain *domain, kuid_t uid);
+struct kdbus_user *kdbus_user_ref(struct kdbus_user *u);
+struct kdbus_user *kdbus_user_unref(struct kdbus_user *u);
+
+#endif
diff --git a/ipc/kdbus/endpoint.c b/ipc/kdbus/endpoint.c
new file mode 100644
index 000000000..977964dbb
--- /dev/null
+++ b/ipc/kdbus/endpoint.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "message.h"
+#include "policy.h"
+
+static void kdbus_ep_free(struct kdbus_node *node)
+{
+ struct kdbus_ep *ep = container_of(node, struct kdbus_ep, node);
+
+ WARN_ON(!list_empty(&ep->conn_list));
+
+ kdbus_policy_db_clear(&ep->policy_db);
+ kdbus_bus_unref(ep->bus);
+ kdbus_user_unref(ep->user);
+ kfree(ep);
+}
+
+static void kdbus_ep_release(struct kdbus_node *node, bool was_active)
+{
+ struct kdbus_ep *ep = container_of(node, struct kdbus_ep, node);
+
+ /* disconnect all connections to this endpoint */
+ for (;;) {
+ struct kdbus_conn *conn;
+
+ mutex_lock(&ep->lock);
+ conn = list_first_entry_or_null(&ep->conn_list,
+ struct kdbus_conn,
+ ep_entry);
+ if (!conn) {
+ mutex_unlock(&ep->lock);
+ break;
+ }
+
+ /* take reference, release lock, disconnect without lock */
+ kdbus_conn_ref(conn);
+ mutex_unlock(&ep->lock);
+
+ kdbus_conn_disconnect(conn, false);
+ kdbus_conn_unref(conn);
+ }
+}
+
+/**
+ * kdbus_ep_new() - create a new endpoint
+ * @bus: The bus this endpoint will be created for
+ * @name: The name of the endpoint
+ * @access: The access flags for this node (KDBUS_MAKE_ACCESS_*)
+ * @uid: The uid of the node
+ * @gid: The gid of the node
+ * @is_custom: Whether this is a custom endpoint
+ *
+ * This function will create a new endpoint with the given
+ * name and properties for a given bus.
+ *
+ * Return: a new kdbus_ep on success, ERR_PTR on failure.
+ */
+struct kdbus_ep *kdbus_ep_new(struct kdbus_bus *bus, const char *name,
+ unsigned int access, kuid_t uid, kgid_t gid,
+ bool is_custom)
+{
+ struct kdbus_ep *e;
+ int ret;
+
+ /*
+ * Only validate custom endpoint names; the default endpoint,
+ * named "bus", is created when the bus itself is created.
+ */
+ if (is_custom) {
+ ret = kdbus_verify_uid_prefix(name, bus->domain->user_namespace,
+ uid);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+
+ e = kzalloc(sizeof(*e), GFP_KERNEL);
+ if (!e)
+ return ERR_PTR(-ENOMEM);
+
+ kdbus_node_init(&e->node, KDBUS_NODE_ENDPOINT);
+
+ e->node.free_cb = kdbus_ep_free;
+ e->node.release_cb = kdbus_ep_release;
+ e->node.uid = uid;
+ e->node.gid = gid;
+ e->node.mode = S_IRUSR | S_IWUSR;
+ if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD))
+ e->node.mode |= S_IRGRP | S_IWGRP;
+ if (access & KDBUS_MAKE_ACCESS_WORLD)
+ e->node.mode |= S_IROTH | S_IWOTH;
+
+ mutex_init(&e->lock);
+ INIT_LIST_HEAD(&e->conn_list);
+ kdbus_policy_db_init(&e->policy_db);
+ e->bus = kdbus_bus_ref(bus);
+
+ ret = kdbus_node_link(&e->node, &bus->node, name);
+ if (ret < 0)
+ goto exit_unref;
+
+ /*
+ * Transactions on custom endpoints are never accounted on the global
+ * user limits. Instead, for each custom endpoint, we create a custom,
+ * unique user, which all transactions are accounted on. Regardless of
+ * the user using that endpoint, it is always accounted on the same
+ * user-object. This budget is not shared with ordinary users on
+ * non-custom endpoints.
+ */
+ if (is_custom) {
+ e->user = kdbus_user_lookup(bus->domain, INVALID_UID);
+ if (IS_ERR(e->user)) {
+ ret = PTR_ERR(e->user);
+ e->user = NULL;
+ goto exit_unref;
+ }
+ }
+
+ return e;
+
+exit_unref:
+ kdbus_node_deactivate(&e->node);
+ kdbus_node_unref(&e->node);
+ return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_ep_ref() - increase the reference counter of a kdbus_ep
+ * @ep: The endpoint to reference
+ *
+ * Every user of an endpoint, except for its creator, must add a reference to
+ * the kdbus_ep instance using this function.
+ *
+ * Return: the ep itself
+ */
+struct kdbus_ep *kdbus_ep_ref(struct kdbus_ep *ep)
+{
+ if (ep)
+ kdbus_node_ref(&ep->node);
+ return ep;
+}
+
+/**
+ * kdbus_ep_unref() - decrease the reference counter of a kdbus_ep
+ * @ep: The ep to unref
+ *
+ * Release a reference. If the reference count drops to 0, the ep will be
+ * freed.
+ *
+ * Return: NULL
+ */
+struct kdbus_ep *kdbus_ep_unref(struct kdbus_ep *ep)
+{
+ if (ep)
+ kdbus_node_unref(&ep->node);
+ return NULL;
+}
+
+/**
+ * kdbus_cmd_ep_make() - handle KDBUS_CMD_ENDPOINT_MAKE
+ * @bus: bus to operate on
+ * @argp: command payload
+ *
+ * Return: NULL or newly created endpoint on success, ERR_PTR on failure.
+ */
+struct kdbus_ep *kdbus_cmd_ep_make(struct kdbus_bus *bus, void __user *argp)
+{
+ const char *item_make_name;
+ struct kdbus_ep *ep = NULL;
+ struct kdbus_cmd *cmd;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_MAKE_NAME, .mandatory = true },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE |
+ KDBUS_MAKE_ACCESS_GROUP |
+ KDBUS_MAKE_ACCESS_WORLD,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+
+ item_make_name = argv[1].item->str;
+
+ ep = kdbus_ep_new(bus, item_make_name, cmd->flags,
+ current_euid(), current_egid(), true);
+ if (IS_ERR(ep)) {
+ ret = PTR_ERR(ep);
+ ep = NULL;
+ goto exit;
+ }
+
+ if (!kdbus_node_activate(&ep->node)) {
+ ret = -ESHUTDOWN;
+ goto exit;
+ }
+
+exit:
+ ret = kdbus_args_clear(&args, ret);
+ if (ret < 0) {
+ if (ep) {
+ kdbus_node_deactivate(&ep->node);
+ kdbus_ep_unref(ep);
+ }
+ return ERR_PTR(ret);
+ }
+ return ep;
+}
+
+/**
+ * kdbus_cmd_ep_update() - handle KDBUS_CMD_ENDPOINT_UPDATE
+ * @ep: endpoint to operate on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_ep_update(struct kdbus_ep *ep, void __user *argp)
+{
+ struct kdbus_cmd *cmd;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_NAME, .multiple = true },
+ { .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ ret = kdbus_policy_set(&ep->policy_db, args.items, args.items_size,
+ 0, true, ep);
+ return kdbus_args_clear(&args, ret);
+}
diff --git a/ipc/kdbus/endpoint.h b/ipc/kdbus/endpoint.h
new file mode 100644
index 000000000..bc1b94a70
--- /dev/null
+++ b/ipc/kdbus/endpoint.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_ENDPOINT_H
+#define __KDBUS_ENDPOINT_H
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/uidgid.h>
+#include "node.h"
+#include "policy.h"
+
+struct kdbus_bus;
+struct kdbus_user;
+
+/**
+ * struct kdbus_ep - endpoint to access a bus
+ * @node: The kdbus node
+ * @lock: Endpoint data lock
+ * @bus: Bus behind this endpoint
+ * @user: Custom endpoints account against an anonymous user
+ * @policy_db: Uploaded policy
+ * @conn_list: Connections of this endpoint
+ *
+ * An endpoint offers access to a bus; the default endpoint node name is "bus".
+ * Additional custom endpoints to the same bus can be created and they can
+ * carry their own policies/filters.
+ */
+struct kdbus_ep {
+ struct kdbus_node node;
+ struct mutex lock;
+
+ /* static */
+ struct kdbus_bus *bus;
+ struct kdbus_user *user;
+
+ /* protected by own locks */
+ struct kdbus_policy_db policy_db;
+
+ /* protected by ep->lock */
+ struct list_head conn_list;
+};
+
+#define kdbus_ep_from_node(_node) \
+ container_of((_node), struct kdbus_ep, node)
+
+struct kdbus_ep *kdbus_ep_new(struct kdbus_bus *bus, const char *name,
+ unsigned int access, kuid_t uid, kgid_t gid,
+ bool is_custom);
+struct kdbus_ep *kdbus_ep_ref(struct kdbus_ep *ep);
+struct kdbus_ep *kdbus_ep_unref(struct kdbus_ep *ep);
+
+struct kdbus_ep *kdbus_cmd_ep_make(struct kdbus_bus *bus, void __user *argp);
+int kdbus_cmd_ep_update(struct kdbus_ep *ep, void __user *argp);
+
+#endif
diff --git a/ipc/kdbus/fs.c b/ipc/kdbus/fs.c
new file mode 100644
index 000000000..09c480924
--- /dev/null
+++ b/ipc/kdbus/fs.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/init.h>
+#include <linux/ipc_namespace.h>
+#include <linux/magic.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "bus.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "fs.h"
+#include "handle.h"
+#include "node.h"
+
+#define kdbus_node_from_dentry(_dentry) \
+ ((struct kdbus_node *)(_dentry)->d_fsdata)
+
+static struct inode *fs_inode_get(struct super_block *sb,
+ struct kdbus_node *node);
+
+/*
+ * Directory Management
+ */
+
+static inline unsigned char kdbus_dt_type(struct kdbus_node *node)
+{
+ switch (node->type) {
+ case KDBUS_NODE_DOMAIN:
+ case KDBUS_NODE_BUS:
+ return DT_DIR;
+ case KDBUS_NODE_CONTROL:
+ case KDBUS_NODE_ENDPOINT:
+ return DT_REG;
+ }
+
+ return DT_UNKNOWN;
+}
+
+static int fs_dir_fop_iterate(struct file *file, struct dir_context *ctx)
+{
+ struct dentry *dentry = file->f_path.dentry;
+ struct kdbus_node *parent = kdbus_node_from_dentry(dentry);
+ struct kdbus_node *old, *next = file->private_data;
+
+ /*
+ * kdbusfs directory iterator (modelled after sysfs/kernfs)
+ * When iterating kdbusfs directories, we iterate all children of the
+ * parent kdbus_node object. We use ctx->pos to store the hash of the
+ * child and file->private_data to store a reference to the next node
+ * object. If ctx->pos is not modified via llseek while you iterate a
+ * directory, then we use the file->private_data node pointer to
+ * directly access the next node in the tree.
+ * However, if you directly seek on the directory, we have to find the
+ * closest node to that position and cannot use our node pointer. This
+ * means iterating the rb-tree to find the closest match and start over
+ * from there.
+ * Note that hash values are not necessarily unique. Therefore, llseek
+ * is not guaranteed to seek to the same node that you got when you
+ * retrieved the position. Seeking to 0, 1, 2 and >=INT_MAX is safe,
+ * though. We could use the inode-number as position, but this would
+ * require another rb-tree for fast access. Kernfs and others already
+ * ignore those conflicts, so we should be fine, too.
+ */
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ /* acquire @next; if deactivated, or seek detected, find next node */
+ old = next;
+ if (next && ctx->pos == next->hash) {
+ if (kdbus_node_acquire(next))
+ kdbus_node_ref(next);
+ else
+ next = kdbus_node_next_child(parent, next);
+ } else {
+ next = kdbus_node_find_closest(parent, ctx->pos);
+ }
+ kdbus_node_unref(old);
+
+ while (next) {
+ /* emit @next */
+ file->private_data = next;
+ ctx->pos = next->hash;
+
+ kdbus_node_release(next);
+
+ if (!dir_emit(ctx, next->name, strlen(next->name), next->id,
+ kdbus_dt_type(next)))
+ return 0;
+
+ /* find next node after @next */
+ old = next;
+ next = kdbus_node_next_child(parent, next);
+ kdbus_node_unref(old);
+ }
+
+ file->private_data = NULL;
+ ctx->pos = INT_MAX;
+
+ return 0;
+}
+
+static loff_t fs_dir_fop_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file_inode(file);
+ loff_t ret;
+
+ /* protect f_off against fop_iterate */
+ mutex_lock(&inode->i_mutex);
+ ret = generic_file_llseek(file, offset, whence);
+ mutex_unlock(&inode->i_mutex);
+
+ return ret;
+}
+
+static int fs_dir_fop_release(struct inode *inode, struct file *file)
+{
+ kdbus_node_unref(file->private_data);
+ return 0;
+}
+
+static const struct file_operations fs_dir_fops = {
+ .read = generic_read_dir,
+ .iterate = fs_dir_fop_iterate,
+ .llseek = fs_dir_fop_llseek,
+ .release = fs_dir_fop_release,
+};
+
+static struct dentry *fs_dir_iop_lookup(struct inode *dir,
+ struct dentry *dentry,
+ unsigned int flags)
+{
+ struct dentry *dnew = NULL;
+ struct kdbus_node *parent;
+ struct kdbus_node *node;
+ struct inode *inode;
+
+ parent = kdbus_node_from_dentry(dentry->d_parent);
+ if (!kdbus_node_acquire(parent))
+ return NULL;
+
+ /* returns reference to _acquired_ child node */
+ node = kdbus_node_find_child(parent, dentry->d_name.name);
+ if (node) {
+ dentry->d_fsdata = node;
+ inode = fs_inode_get(dir->i_sb, node);
+ if (IS_ERR(inode))
+ dnew = ERR_CAST(inode);
+ else
+ dnew = d_splice_alias(inode, dentry);
+
+ kdbus_node_release(node);
+ }
+
+ kdbus_node_release(parent);
+ return dnew;
+}
+
+static const struct inode_operations fs_dir_iops = {
+ .permission = generic_permission,
+ .lookup = fs_dir_iop_lookup,
+};
+
+/*
+ * Inode Management
+ */
+
+static const struct inode_operations fs_inode_iops = {
+ .permission = generic_permission,
+};
+
+static struct inode *fs_inode_get(struct super_block *sb,
+ struct kdbus_node *node)
+{
+ struct inode *inode;
+
+ inode = iget_locked(sb, node->id);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ inode->i_private = kdbus_node_ref(node);
+ inode->i_mapping->a_ops = &empty_aops;
+ inode->i_mode = node->mode & S_IALLUGO;
+ inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ inode->i_uid = node->uid;
+ inode->i_gid = node->gid;
+
+ switch (node->type) {
+ case KDBUS_NODE_DOMAIN:
+ case KDBUS_NODE_BUS:
+ inode->i_mode |= S_IFDIR;
+ inode->i_op = &fs_dir_iops;
+ inode->i_fop = &fs_dir_fops;
+ set_nlink(inode, 2);
+ break;
+ case KDBUS_NODE_CONTROL:
+ case KDBUS_NODE_ENDPOINT:
+ inode->i_mode |= S_IFREG;
+ inode->i_op = &fs_inode_iops;
+ inode->i_fop = &kdbus_handle_ops;
+ break;
+ }
+
+ unlock_new_inode(inode);
+
+ return inode;
+}
+
+/*
+ * Superblock Management
+ */
+
+static int fs_super_dop_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct kdbus_node *node;
+
+ /* Force lookup on negatives */
+ if (!dentry->d_inode)
+ return 0;
+
+ node = kdbus_node_from_dentry(dentry);
+
+ /* see whether the node has been removed */
+ if (!kdbus_node_is_active(node))
+ return 0;
+
+ return 1;
+}
+
+static void fs_super_dop_release(struct dentry *dentry)
+{
+ kdbus_node_unref(dentry->d_fsdata);
+}
+
+static const struct dentry_operations fs_super_dops = {
+ .d_revalidate = fs_super_dop_revalidate,
+ .d_release = fs_super_dop_release,
+};
+
+static void fs_super_sop_evict_inode(struct inode *inode)
+{
+ struct kdbus_node *node = kdbus_node_from_inode(inode);
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+ kdbus_node_unref(node);
+}
+
+static const struct super_operations fs_super_sops = {
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode,
+ .evict_inode = fs_super_sop_evict_inode,
+};
+
+static int fs_super_fill(struct super_block *sb)
+{
+ struct kdbus_domain *domain = sb->s_fs_info;
+ struct inode *inode;
+ int ret;
+
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = KDBUS_SUPER_MAGIC;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &fs_super_sops;
+ sb->s_time_gran = 1;
+
+ inode = fs_inode_get(sb, &domain->node);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
+ /* d_make_root iput()s the inode on failure */
+ return -ENOMEM;
+ }
+
+ /* sb holds domain reference */
+ sb->s_root->d_fsdata = &domain->node;
+ sb->s_d_op = &fs_super_dops;
+
+ /* sb holds root reference */
+ domain->dentry = sb->s_root;
+
+ if (!kdbus_node_activate(&domain->node))
+ return -ESHUTDOWN;
+
+ ret = kdbus_domain_populate(domain, KDBUS_MAKE_ACCESS_WORLD);
+ if (ret < 0)
+ return ret;
+
+ sb->s_flags |= MS_ACTIVE;
+ return 0;
+}
+
+static void fs_super_kill(struct super_block *sb)
+{
+ struct kdbus_domain *domain = sb->s_fs_info;
+
+ if (domain) {
+ kdbus_node_deactivate(&domain->node);
+ domain->dentry = NULL;
+ }
+
+ kill_anon_super(sb);
+ kdbus_domain_unref(domain);
+}
+
+static int fs_super_set(struct super_block *sb, void *data)
+{
+ int ret;
+
+ ret = set_anon_super(sb, data);
+ if (!ret)
+ sb->s_fs_info = data;
+
+ return ret;
+}
+
+static struct dentry *fs_super_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
+{
+ struct kdbus_domain *domain;
+ struct super_block *sb;
+ int ret;
+
+ domain = kdbus_domain_new(KDBUS_MAKE_ACCESS_WORLD);
+ if (IS_ERR(domain))
+ return ERR_CAST(domain);
+
+ sb = sget(fs_type, NULL, fs_super_set, flags, domain);
+ if (IS_ERR(sb)) {
+ kdbus_node_deactivate(&domain->node);
+ kdbus_domain_unref(domain);
+ return ERR_CAST(sb);
+ }
+
+ WARN_ON(sb->s_fs_info != domain);
+ WARN_ON(sb->s_root);
+
+ ret = fs_super_fill(sb);
+ if (ret < 0) {
+ /* calls into ->kill_sb() when done */
+ deactivate_locked_super(sb);
+ return ERR_PTR(ret);
+ }
+
+ return dget(sb->s_root);
+}
+
+static struct file_system_type fs_type = {
+ .name = KBUILD_MODNAME "fs",
+ .owner = THIS_MODULE,
+ .mount = fs_super_mount,
+ .kill_sb = fs_super_kill,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+
+/**
+ * kdbus_fs_init() - register kdbus filesystem
+ *
+ * This registers a filesystem with the VFS layer. The filesystem is called
+ * `KBUILD_MODNAME "fs"', which usually resolves to `kdbusfs'. The naming
+ * scheme allows setting KBUILD_MODNAME to, e.g., "kdbus2" to get an
+ * independent filesystem instance for development.
+ *
+ * Each mount of the kdbusfs filesystem has a kdbus_domain attached.
+ * Operations on this mount will only affect the attached domain. On each mount
+ * a new domain is automatically created and used for this mount exclusively.
+ * If you want to share a domain across multiple mounts, you need to bind-mount
+ * it.
+ *
+ * Mounts of kdbusfs (with a different domain each) are unrelated to each other
+ * and will never have any effect on any domain but their own.
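+ *
+ * As an illustration only (the mount point below is a user-space convention,
+ * not enforced by this code), an instance is typically mounted via:
+ *   mount -t kdbusfs kdbusfs /sys/fs/kdbus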
+ *
+ * Return: 0 on success, negative error otherwise.
+ */
+int kdbus_fs_init(void)
+{
+ return register_filesystem(&fs_type);
+}
+
+/**
+ * kdbus_fs_exit() - unregister kdbus filesystem
+ *
+ * This does the reverse to kdbus_fs_init(). It unregisters the kdbusfs
+ * filesystem from VFS and cleans up any allocated resources.
+ */
+void kdbus_fs_exit(void)
+{
+ unregister_filesystem(&fs_type);
+}
+
+/* acquire domain of @node, making sure all ancestors are active */
+static struct kdbus_domain *fs_acquire_domain(struct kdbus_node *node)
+{
+ struct kdbus_domain *domain;
+ struct kdbus_node *iter;
+
+ /* caller must guarantee that @node is linked */
+ for (iter = node; iter->parent; iter = iter->parent)
+ if (!kdbus_node_is_active(iter->parent))
+ return NULL;
+
+ /* root nodes are always domains */
+ if (WARN_ON(iter->type != KDBUS_NODE_DOMAIN))
+ return NULL;
+
+ domain = kdbus_domain_from_node(iter);
+ if (!kdbus_node_acquire(&domain->node))
+ return NULL;
+
+ return domain;
+}
+
+/**
+ * kdbus_fs_flush() - flush dcache entries of a node
+ * @node: Node to flush entries of
+ *
+ * This flushes all VFS filesystem cache entries for a node and all its
+ * children. This should be called whenever a node is destroyed during
+ * runtime. It will flush the cache entries so the linked objects can be
+ * deallocated.
+ *
+ * This is a no-op if you call it on active nodes (they really should stay in
+ * cache) or on nodes with deactivated parents (flushing the parent is enough).
+ * Furthermore, there is no need to call it on nodes whose lifetime is bound to
+ * their parents'. In those cases, the parent-flush will always also flush the
+ * children.
+ */
+void kdbus_fs_flush(struct kdbus_node *node)
+{
+ struct dentry *dentry, *parent_dentry = NULL;
+ struct kdbus_domain *domain;
+ struct qstr name;
+
+ /* active nodes should remain in cache */
+ if (!kdbus_node_is_deactivated(node))
+ return;
+
+ /* nodes that were never linked were never instantiated */
+ if (!node->parent)
+ return;
+
+ /* acquire domain and verify all ancestors are active */
+ domain = fs_acquire_domain(node);
+ if (!domain)
+ return;
+
+ switch (node->type) {
+ case KDBUS_NODE_ENDPOINT:
+ if (WARN_ON(!node->parent || !node->parent->name))
+ goto exit;
+
+ name.name = node->parent->name;
+ name.len = strlen(node->parent->name);
+ parent_dentry = d_hash_and_lookup(domain->dentry, &name);
+ if (IS_ERR_OR_NULL(parent_dentry))
+ goto exit;
+
+ /* fallthrough */
+ case KDBUS_NODE_BUS:
+ if (WARN_ON(!node->name))
+ goto exit;
+
+ name.name = node->name;
+ name.len = strlen(node->name);
+ dentry = d_hash_and_lookup(parent_dentry ? : domain->dentry,
+ &name);
+ if (!IS_ERR_OR_NULL(dentry)) {
+ d_invalidate(dentry);
+ dput(dentry);
+ }
+
+ dput(parent_dentry);
+ break;
+
+ default:
+ /* all other types are bound to their parent lifetime */
+ break;
+ }
+
+exit:
+ kdbus_node_release(&domain->node);
+}
diff --git a/ipc/kdbus/fs.h b/ipc/kdbus/fs.h
new file mode 100644
index 000000000..62f7d6abf
--- /dev/null
+++ b/ipc/kdbus/fs.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUSFS_H
+#define __KDBUSFS_H
+
+#include <linux/kernel.h>
+
+struct kdbus_node;
+
+int kdbus_fs_init(void);
+void kdbus_fs_exit(void);
+void kdbus_fs_flush(struct kdbus_node *node);
+
+#define kdbus_node_from_inode(_inode) \
+ ((struct kdbus_node *)(_inode)->i_private)
+
+#endif
diff --git a/ipc/kdbus/handle.c b/ipc/kdbus/handle.c
new file mode 100644
index 000000000..e0e06b0e1
--- /dev/null
+++ b/ipc/kdbus/handle.c
@@ -0,0 +1,709 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/kdev_t.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "fs.h"
+#include "handle.h"
+#include "item.h"
+#include "match.h"
+#include "message.h"
+#include "names.h"
+#include "domain.h"
+#include "policy.h"
+
+static int kdbus_args_verify(struct kdbus_args *args)
+{
+ struct kdbus_item *item;
+ size_t i;
+ int ret;
+
+ KDBUS_ITEMS_FOREACH(item, args->items, args->items_size) {
+ struct kdbus_arg *arg = NULL;
+
+ if (!KDBUS_ITEM_VALID(item, args->items, args->items_size))
+ return -EINVAL;
+
+ for (i = 0; i < args->argc; ++i)
+ if (args->argv[i].type == item->type)
+ break;
+ if (i >= args->argc)
+ return -EINVAL;
+
+ arg = &args->argv[i];
+
+ ret = kdbus_item_validate(item);
+ if (ret < 0)
+ return ret;
+
+ if (arg->item && !arg->multiple)
+ return -EINVAL;
+
+ arg->item = item;
+ }
+
+ if (!KDBUS_ITEMS_END(item, args->items, args->items_size))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int kdbus_args_negotiate(struct kdbus_args *args)
+{
+ struct kdbus_item __user *user;
+ struct kdbus_item *negotiation;
+ size_t i, j, num;
+
+ /*
+ * If KDBUS_FLAG_NEGOTIATE is set, we overwrite the flags field with
+ * the set of supported flags. Furthermore, if a KDBUS_ITEM_NEGOTIATE
+ * item is passed, we iterate its payload (array of u64, each set to an
+ * item type) and clear all unsupported item-types to 0.
+ * The caller might do this recursively, if other flags or objects are
+ * embedded in the payload itself.
+ */
+
+ if (args->cmd->flags & KDBUS_FLAG_NEGOTIATE) {
+ if (put_user(args->allowed_flags & ~KDBUS_FLAG_NEGOTIATE,
+ &args->user->flags))
+ return -EFAULT;
+ }
+
+ if (args->argc < 1 || args->argv[0].type != KDBUS_ITEM_NEGOTIATE ||
+ !args->argv[0].item)
+ return 0;
+
+ negotiation = args->argv[0].item;
+ user = (struct kdbus_item __user *)
+ ((u8 __user *)args->user +
+ ((u8 *)negotiation - (u8 *)args->cmd));
+ num = KDBUS_ITEM_PAYLOAD_SIZE(negotiation) / sizeof(u64);
+
+ for (i = 0; i < num; ++i) {
+ for (j = 0; j < args->argc; ++j)
+ if (negotiation->data64[i] == args->argv[j].type)
+ break;
+
+ if (j < args->argc)
+ continue;
+
+ /* this item is not supported, clear it out */
+ negotiation->data64[i] = 0;
+ if (put_user(negotiation->data64[i], &user->data64[i]))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/**
+ * __kdbus_args_parse() - parse payload of kdbus command
+ * @args: object to parse data into
+ * @is_cmd: whether this is a command or msg payload
+ * @argp: user-space location of command payload to parse
+ * @type_size: size of the command's fixed-size part (minimum payload size)
+ * @items_offset: offset of items array in command payload
+ * @out: output variable to store pointer to copied payload
+ *
+ * This parses the ioctl payload at user-space location @argp into @args. @args
+ * must be pre-initialized by the caller to reflect the supported flags and
+ * items of this command. This parser will then copy the command payload into
+ * kernel-space, verify correctness and consistency and cache pointers to parsed
+ * items and other data in @args.
+ *
+ * If this function succeeded, you must call kdbus_args_clear() to release
+ * allocated resources before destroying @args.
+ *
+ * This can also be used to import kdbus_msg objects. In that case, @is_cmd must
+ * be set to 'false' and the 'return_flags' field will not be touched (as it
+ * doesn't exist on kdbus_msg).
+ *
+ * Return: On failure a negative error code is returned. Otherwise, 1 is
+ * returned if negotiation was requested, 0 if not.
+ */
+int __kdbus_args_parse(struct kdbus_args *args, bool is_cmd, void __user *argp,
+ size_t type_size, size_t items_offset, void **out)
+{
+ u64 user_size;
+ int ret, i;
+
+ ret = kdbus_copy_from_user(&user_size, argp, sizeof(user_size));
+ if (ret < 0)
+ return ret;
+
+ if (user_size < type_size)
+ return -EINVAL;
+ if (user_size > KDBUS_CMD_MAX_SIZE)
+ return -EMSGSIZE;
+
+ if (user_size <= sizeof(args->cmd_buf)) {
+ if (copy_from_user(args->cmd_buf, argp, user_size))
+ return -EFAULT;
+ args->cmd = (void *)args->cmd_buf;
+ } else {
+ args->cmd = memdup_user(argp, user_size);
+ if (IS_ERR(args->cmd))
+ return PTR_ERR(args->cmd);
+ }
+
+ if (args->cmd->size != user_size) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (is_cmd)
+ args->cmd->return_flags = 0;
+ args->user = argp;
+ args->items = (void *)((u8 *)args->cmd + items_offset);
+ args->items_size = args->cmd->size - items_offset;
+ args->is_cmd = is_cmd;
+
+ if (args->cmd->flags & ~args->allowed_flags) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ ret = kdbus_args_verify(args);
+ if (ret < 0)
+ goto error;
+
+ ret = kdbus_args_negotiate(args);
+ if (ret < 0)
+ goto error;
+
+ /* mandatory items must be given (but not on negotiation) */
+ if (!(args->cmd->flags & KDBUS_FLAG_NEGOTIATE)) {
+ for (i = 0; i < args->argc; ++i)
+ if (args->argv[i].mandatory && !args->argv[i].item) {
+ ret = -EINVAL;
+ goto error;
+ }
+ }
+
+ *out = args->cmd;
+ return !!(args->cmd->flags & KDBUS_FLAG_NEGOTIATE);
+
+error:
+ return kdbus_args_clear(args, ret);
+}
+
+/**
+ * kdbus_args_clear() - release allocated command resources
+ * @args: object to release resources of
+ * @ret: return value of this command
+ *
+ * This frees all allocated resources on @args and copies the command result
+ * flags into user-space. @ret is usually returned unchanged by this function,
+ * so it can be used in the final 'return' statement of the command handler.
+ *
+ * Return: -EFAULT if return values cannot be copied into user-space, otherwise
+ * @ret is returned unchanged.
+ */
+int kdbus_args_clear(struct kdbus_args *args, int ret)
+{
+ if (!args)
+ return ret;
+
+ if (!IS_ERR_OR_NULL(args->cmd)) {
+ if (args->is_cmd && put_user(args->cmd->return_flags,
+ &args->user->return_flags))
+ ret = -EFAULT;
+ if (args->cmd != (void *)args->cmd_buf)
+ kfree(args->cmd);
+ args->cmd = NULL;
+ }
+
+ return ret;
+}
+
+/**
+ * enum kdbus_handle_type - type a handle can be of
+ * @KDBUS_HANDLE_NONE: no type set, yet
+ * @KDBUS_HANDLE_BUS_OWNER: bus owner
+ * @KDBUS_HANDLE_EP_OWNER: endpoint owner
+ * @KDBUS_HANDLE_CONNECTED: endpoint connection after HELLO
+ */
+enum kdbus_handle_type {
+ KDBUS_HANDLE_NONE,
+ KDBUS_HANDLE_BUS_OWNER,
+ KDBUS_HANDLE_EP_OWNER,
+ KDBUS_HANDLE_CONNECTED,
+};
+
+/**
+ * struct kdbus_handle - handle to the kdbus system
+ * @lock: handle lock
+ * @type: type of this handle (KDBUS_HANDLE_*)
+ * @bus_owner: bus this handle owns
+ * @ep_owner: endpoint this handle owns
+ * @conn: connection this handle owns
+ * @privileged: Flag to mark a handle as privileged
+ */
+struct kdbus_handle {
+ struct mutex lock;
+
+ enum kdbus_handle_type type;
+ union {
+ struct kdbus_bus *bus_owner;
+ struct kdbus_ep *ep_owner;
+ struct kdbus_conn *conn;
+ };
+
+ bool privileged:1;
+};
+
+static int kdbus_handle_open(struct inode *inode, struct file *file)
+{
+ struct kdbus_handle *handle;
+ struct kdbus_node *node;
+ int ret;
+
+ node = kdbus_node_from_inode(inode);
+ if (!kdbus_node_acquire(node))
+ return -ESHUTDOWN;
+
+ handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+ if (!handle) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ mutex_init(&handle->lock);
+ handle->type = KDBUS_HANDLE_NONE;
+
+ if (node->type == KDBUS_NODE_ENDPOINT) {
+ struct kdbus_ep *ep = kdbus_ep_from_node(node);
+ struct kdbus_bus *bus = ep->bus;
+
+ /*
+ * A connection is privileged if it is opened on an endpoint
+ * without custom policy and either:
+ * * the user has CAP_IPC_OWNER in the domain user namespace
+ * or
+ * * the callers euid matches the uid of the bus creator
+ */
+ if (!ep->user &&
+ (ns_capable(bus->domain->user_namespace, CAP_IPC_OWNER) ||
+ uid_eq(file->f_cred->euid, bus->node.uid)))
+ handle->privileged = true;
+ }
+
+ file->private_data = handle;
+ ret = 0;
+
+exit:
+ kdbus_node_release(node);
+ return ret;
+}
+
+static int kdbus_handle_release(struct inode *inode, struct file *file)
+{
+ struct kdbus_handle *handle = file->private_data;
+
+ switch (handle->type) {
+ case KDBUS_HANDLE_BUS_OWNER:
+ if (handle->bus_owner) {
+ kdbus_node_deactivate(&handle->bus_owner->node);
+ kdbus_bus_unref(handle->bus_owner);
+ }
+ break;
+ case KDBUS_HANDLE_EP_OWNER:
+ if (handle->ep_owner) {
+ kdbus_node_deactivate(&handle->ep_owner->node);
+ kdbus_ep_unref(handle->ep_owner);
+ }
+ break;
+ case KDBUS_HANDLE_CONNECTED:
+ kdbus_conn_disconnect(handle->conn, false);
+ kdbus_conn_unref(handle->conn);
+ break;
+ case KDBUS_HANDLE_NONE:
+ /* nothing to clean up */
+ break;
+ }
+
+ kfree(handle);
+
+ return 0;
+}
+
+static long kdbus_handle_ioctl_control(struct file *file, unsigned int cmd,
+ void __user *argp)
+{
+ struct kdbus_handle *handle = file->private_data;
+ struct kdbus_node *node = file_inode(file)->i_private;
+ struct kdbus_domain *domain;
+ int ret = 0;
+
+ if (!kdbus_node_acquire(node))
+ return -ESHUTDOWN;
+
+ /*
+ * The parent of control-nodes is always a domain; make sure to pin it
+ * so the parent is actually valid.
+ */
+ domain = kdbus_domain_from_node(node->parent);
+ if (!kdbus_node_acquire(&domain->node)) {
+ kdbus_node_release(node);
+ return -ESHUTDOWN;
+ }
+
+ switch (cmd) {
+ case KDBUS_CMD_BUS_MAKE: {
+ struct kdbus_bus *bus;
+
+ bus = kdbus_cmd_bus_make(domain, argp);
+ if (IS_ERR_OR_NULL(bus)) {
+ ret = PTR_ERR_OR_ZERO(bus);
+ break;
+ }
+
+ handle->bus_owner = bus;
+ ret = KDBUS_HANDLE_BUS_OWNER;
+ break;
+ }
+
+ default:
+ ret = -EBADFD;
+ break;
+ }
+
+ kdbus_node_release(&domain->node);
+ kdbus_node_release(node);
+ return ret;
+}
+
+static long kdbus_handle_ioctl_ep(struct file *file, unsigned int cmd,
+ void __user *buf)
+{
+ struct kdbus_handle *handle = file->private_data;
+ struct kdbus_node *node = file_inode(file)->i_private;
+ struct kdbus_ep *ep, *file_ep = kdbus_ep_from_node(node);
+ struct kdbus_conn *conn;
+ int ret = 0;
+
+ if (!kdbus_node_acquire(node))
+ return -ESHUTDOWN;
+
+ switch (cmd) {
+ case KDBUS_CMD_ENDPOINT_MAKE:
+ /* creating custom endpoints is a privileged operation */
+ if (!handle->privileged) {
+ ret = -EPERM;
+ break;
+ }
+
+ ep = kdbus_cmd_ep_make(file_ep->bus, buf);
+ if (IS_ERR_OR_NULL(ep)) {
+ ret = PTR_ERR_OR_ZERO(ep);
+ break;
+ }
+
+ handle->ep_owner = ep;
+ ret = KDBUS_HANDLE_EP_OWNER;
+ break;
+
+ case KDBUS_CMD_HELLO:
+ conn = kdbus_cmd_hello(file_ep, handle->privileged, buf);
+ if (IS_ERR_OR_NULL(conn)) {
+ ret = PTR_ERR_OR_ZERO(conn);
+ break;
+ }
+
+ handle->conn = conn;
+ ret = KDBUS_HANDLE_CONNECTED;
+ break;
+
+ default:
+ ret = -EBADFD;
+ break;
+ }
+
+ kdbus_node_release(node);
+ return ret;
+}
+
+static long kdbus_handle_ioctl_ep_owner(struct file *file, unsigned int command,
+ void __user *buf)
+{
+ struct kdbus_handle *handle = file->private_data;
+ struct kdbus_ep *ep = handle->ep_owner;
+ int ret;
+
+ if (!kdbus_node_acquire(&ep->node))
+ return -ESHUTDOWN;
+
+ switch (command) {
+ case KDBUS_CMD_ENDPOINT_UPDATE:
+ ret = kdbus_cmd_ep_update(ep, buf);
+ break;
+ default:
+ ret = -EBADFD;
+ break;
+ }
+
+ kdbus_node_release(&ep->node);
+ return ret;
+}
+
+static long kdbus_handle_ioctl_connected(struct file *file,
+ unsigned int command, void __user *buf)
+{
+ struct kdbus_handle *handle = file->private_data;
+ struct kdbus_conn *conn = handle->conn;
+ struct kdbus_conn *release_conn = NULL;
+ int ret;
+
+ release_conn = conn;
+ ret = kdbus_conn_acquire(release_conn);
+ if (ret < 0)
+ return ret;
+
+ switch (command) {
+ case KDBUS_CMD_BYEBYE:
+ /*
+ * BYEBYE is special; we must not acquire a connection when
+ * calling into kdbus_conn_disconnect() or we will deadlock,
+ * because kdbus_conn_disconnect() will wait for all acquired
+ * references to be dropped.
+ */
+ kdbus_conn_release(release_conn);
+ release_conn = NULL;
+ ret = kdbus_cmd_byebye_unlocked(conn, buf);
+ break;
+ case KDBUS_CMD_NAME_ACQUIRE:
+ ret = kdbus_cmd_name_acquire(conn, buf);
+ break;
+ case KDBUS_CMD_NAME_RELEASE:
+ ret = kdbus_cmd_name_release(conn, buf);
+ break;
+ case KDBUS_CMD_LIST:
+ ret = kdbus_cmd_list(conn, buf);
+ break;
+ case KDBUS_CMD_CONN_INFO:
+ ret = kdbus_cmd_conn_info(conn, buf);
+ break;
+ case KDBUS_CMD_BUS_CREATOR_INFO:
+ ret = kdbus_cmd_bus_creator_info(conn, buf);
+ break;
+ case KDBUS_CMD_UPDATE:
+ ret = kdbus_cmd_update(conn, buf);
+ break;
+ case KDBUS_CMD_MATCH_ADD:
+ ret = kdbus_cmd_match_add(conn, buf);
+ break;
+ case KDBUS_CMD_MATCH_REMOVE:
+ ret = kdbus_cmd_match_remove(conn, buf);
+ break;
+ case KDBUS_CMD_SEND:
+ ret = kdbus_cmd_send(conn, file, buf);
+ break;
+ case KDBUS_CMD_RECV:
+ ret = kdbus_cmd_recv(conn, buf);
+ break;
+ case KDBUS_CMD_FREE:
+ ret = kdbus_cmd_free(conn, buf);
+ break;
+ default:
+ ret = -EBADFD;
+ break;
+ }
+
+ kdbus_conn_release(release_conn);
+ return ret;
+}
+
+static long kdbus_handle_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct kdbus_handle *handle = file->private_data;
+ struct kdbus_node *node = kdbus_node_from_inode(file_inode(file));
+ void __user *argp = (void __user *)arg;
+ long ret = -EBADFD;
+
+ switch (cmd) {
+ case KDBUS_CMD_BUS_MAKE:
+ case KDBUS_CMD_ENDPOINT_MAKE:
+ case KDBUS_CMD_HELLO:
+ mutex_lock(&handle->lock);
+ if (handle->type == KDBUS_HANDLE_NONE) {
+ if (node->type == KDBUS_NODE_CONTROL)
+ ret = kdbus_handle_ioctl_control(file, cmd,
+ argp);
+ else if (node->type == KDBUS_NODE_ENDPOINT)
+ ret = kdbus_handle_ioctl_ep(file, cmd, argp);
+
+ if (ret > 0) {
+ /*
+ * The data given via open() is not sufficient
+ * to set up a kdbus handle. Hence, we require
+ * the user to perform a setup ioctl. This setup
+ * can only be performed once and defines the
+ * type of the handle. The different setup
+ * ioctls are locked against each other so they
+ * cannot race. Once the handle type is set,
+ * the type-dependent ioctls are enabled. To
+ * improve performance, we don't lock those via
+ * handle->lock. Instead, we issue a
+ * write-barrier before performing the
+ * type-change, which pairs with smp_rmb() in
+ * all handlers that access the type field. This
+ * guarantees the handle is fully setup, if
+ * handle->type is set. If handle->type is
+ * unset, you must not make any assumptions
+ * without taking handle->lock.
+ * Note that handle->type is only set once. It
+ * will never change afterwards.
+ */
+ smp_wmb();
+ handle->type = ret;
+ }
+ }
+ mutex_unlock(&handle->lock);
+ break;
+
+ case KDBUS_CMD_ENDPOINT_UPDATE:
+ case KDBUS_CMD_BYEBYE:
+ case KDBUS_CMD_NAME_ACQUIRE:
+ case KDBUS_CMD_NAME_RELEASE:
+ case KDBUS_CMD_LIST:
+ case KDBUS_CMD_CONN_INFO:
+ case KDBUS_CMD_BUS_CREATOR_INFO:
+ case KDBUS_CMD_UPDATE:
+ case KDBUS_CMD_MATCH_ADD:
+ case KDBUS_CMD_MATCH_REMOVE:
+ case KDBUS_CMD_SEND:
+ case KDBUS_CMD_RECV:
+ case KDBUS_CMD_FREE: {
+ enum kdbus_handle_type type;
+
+ /*
+ * This read-barrier pairs with smp_wmb() of the handle setup.
+ * It guarantees the handle is fully written, in case the
+ * type has been set. It allows us to access the handle without
+ * taking handle->lock, given the guarantee that the type is
+ * only ever set once, and stays constant afterwards.
+ * Furthermore, the handle object itself is not modified in any
+ * way after the type is set. That is, the type-field is the
+ * last field that is written on any handle. If it has not been
+ * set, we must not access the handle here.
+ */
+ type = handle->type;
+ smp_rmb();
+
+ if (type == KDBUS_HANDLE_EP_OWNER)
+ ret = kdbus_handle_ioctl_ep_owner(file, cmd, argp);
+ else if (type == KDBUS_HANDLE_CONNECTED)
+ ret = kdbus_handle_ioctl_connected(file, cmd, argp);
+
+ break;
+ }
+ default:
+ ret = -ENOTTY;
+ break;
+ }
+
+ return ret < 0 ? ret : 0;
+}
+
+static unsigned int kdbus_handle_poll(struct file *file,
+ struct poll_table_struct *wait)
+{
+ struct kdbus_handle *handle = file->private_data;
+ enum kdbus_handle_type type;
+ unsigned int mask = POLLOUT | POLLWRNORM;
+
+ /*
+ * This pairs with smp_wmb() during handle setup. It guarantees that
+ * _iff_ the handle type is set, handle->conn is valid. Furthermore,
+ * _iff_ the type is set, the handle object is constant and never
+ * changed again. If it's not set, we must not access the handle but
+ * bail out. We also must assume no setup has taken place, yet.
+ */
+ type = handle->type;
+ smp_rmb();
+
+ /* Only a connected endpoint can read/write data */
+ if (type != KDBUS_HANDLE_CONNECTED)
+ return POLLERR | POLLHUP;
+
+ poll_wait(file, &handle->conn->wait, wait);
+
+ /*
+ * Verify the connection hasn't been deactivated _after_ adding the
+ * wait-queue. This guarantees that, if the connection is deactivated
+ * after we checked it, the waitqueue is signaled and we're called
+ * again.
+ */
+ if (!kdbus_conn_active(handle->conn))
+ return POLLERR | POLLHUP;
+
+ if (!list_empty(&handle->conn->queue.msg_list) ||
+ atomic_read(&handle->conn->lost_count) > 0)
+ mask |= POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+static int kdbus_handle_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct kdbus_handle *handle = file->private_data;
+ enum kdbus_handle_type type;
+ int ret = -EBADFD;
+
+ /*
+ * This pairs with smp_wmb() during handle setup. It guarantees that
+ * _iff_ the handle type is set, handle->conn is valid. Furthermore,
+ * _iff_ the type is set, the handle object is constant and never
+ * changed again. If it's not set, we must not access the handle but
+ * bail out. We also must assume no setup has taken place, yet.
+ */
+ type = handle->type;
+ smp_rmb();
+
+ /* Only connected handles have a pool we can map */
+ if (type == KDBUS_HANDLE_CONNECTED)
+ ret = kdbus_pool_mmap(handle->conn->pool, vma);
+
+ return ret;
+}
+
+const struct file_operations kdbus_handle_ops = {
+ .owner = THIS_MODULE,
+ .open = kdbus_handle_open,
+ .release = kdbus_handle_release,
+ .poll = kdbus_handle_poll,
+ .llseek = noop_llseek,
+ .unlocked_ioctl = kdbus_handle_ioctl,
+ .mmap = kdbus_handle_mmap,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = kdbus_handle_ioctl,
+#endif
+};
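
The ioctl, poll and mmap entry points above all rely on the same publish-once pattern for handle->type: the setup path initializes the handle completely, issues smp_wmb(), and only then stores the type; readers load the type first, issue smp_rmb(), and touch the rest of the handle only if the type was already set. Below is a reduced stand-alone sketch of that ordering; the demo_* names and the two-field struct are illustrative assumptions (the usual kernel headers are assumed), only the barrier pairing mirrors the code above.

struct demo_handle {
	int type;		/* zero until fully set up; written exactly once */
	void *payload;		/* only valid once @type is non-zero */
};

/* writer: initialize everything, then publish by storing the type last */
static void demo_publish(struct demo_handle *h, void *payload, int type)
{
	h->payload = payload;
	smp_wmb();		/* order the payload store before the type store */
	h->type = type;		/* publish: readers may now use the handle */
}

/* reader: load the type first; only touch the payload if the type was set */
static void *demo_lookup(struct demo_handle *h)
{
	int type = h->type;

	smp_rmb();		/* pairs with smp_wmb() in demo_publish() */
	return type ? h->payload : NULL;
}

The same ordering could equally be expressed with an smp_store_release()/smp_load_acquire() pair; the explicit barriers here simply match the style used in this patch.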
diff --git a/ipc/kdbus/handle.h b/ipc/kdbus/handle.h
new file mode 100644
index 000000000..8a36c0595
--- /dev/null
+++ b/ipc/kdbus/handle.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_HANDLE_H
+#define __KDBUS_HANDLE_H
+
+#include <linux/fs.h>
+#include <uapi/linux/kdbus.h>
+
+extern const struct file_operations kdbus_handle_ops;
+
+/**
+ * struct kdbus_arg - information and state of a single ioctl command item
+ * @type: item type
+ * @item: set by the parser to the first found item of this type
+ * @multiple: whether multiple items of this type are allowed
+ * @mandatory: whether at least one item of this type is required
+ *
+ * This structure describes a single item in an ioctl command payload. The
+ * caller has to pre-fill the type and flags, the parser will then use this
+ * information to verify the ioctl payload. @item is set by the parser to point
+ * to the first occurrence of the item.
+ */
+struct kdbus_arg {
+ u64 type;
+ struct kdbus_item *item;
+ bool multiple : 1;
+ bool mandatory : 1;
+};
+
+/**
+ * struct kdbus_args - information and state of ioctl command parser
+ * @allowed_flags: set of flags this command supports
+ * @argc: number of items in @argv
+ * @argv: array of items this command supports
+ * @user: set by parser to user-space location of current command
+ * @cmd: set by parser to kernel copy of command payload
+ * @cmd_buf: 512 bytes inline buf to avoid kmalloc() on small cmds
+ * @items: points to item array in @cmd
+ * @items_size: size of @items in bytes
+ * @is_cmd: whether this is a command-payload or msg-payload
+ *
+ * This structure is used to parse ioctl command payloads on each invocation.
+ * The ioctl handler has to pre-fill the flags and allowed items before passing
+ * the object to kdbus_args_parse(). The parser will copy the command payload
+ * into kernel-space and verify the correctness of the data.
+ *
+ * We use a 512-byte buffer for small command payloads, allocated on the
+ * stack at syscall entry.
+ */
+struct kdbus_args {
+ u64 allowed_flags;
+ size_t argc;
+ struct kdbus_arg *argv;
+
+ struct kdbus_cmd __user *user;
+ struct kdbus_cmd *cmd;
+ u8 cmd_buf[512];
+
+ struct kdbus_item *items;
+ size_t items_size;
+ bool is_cmd : 1;
+};
+
+int __kdbus_args_parse(struct kdbus_args *args, bool is_cmd, void __user *argp,
+ size_t type_size, size_t items_offset, void **out);
+int kdbus_args_clear(struct kdbus_args *args, int ret);
+
+#define kdbus_args_parse(_args, _argp, _v) \
+ ({ \
+ BUILD_BUG_ON(offsetof(typeof(**(_v)), size) != \
+ offsetof(struct kdbus_cmd, size)); \
+ BUILD_BUG_ON(offsetof(typeof(**(_v)), flags) != \
+ offsetof(struct kdbus_cmd, flags)); \
+ BUILD_BUG_ON(offsetof(typeof(**(_v)), return_flags) != \
+ offsetof(struct kdbus_cmd, return_flags)); \
+ __kdbus_args_parse((_args), 1, (_argp), sizeof(**(_v)), \
+ offsetof(typeof(**(_v)), items), \
+ (void **)(_v)); \
+ })
+
+#define kdbus_args_parse_msg(_args, _argp, _v) \
+ ({ \
+ BUILD_BUG_ON(offsetof(typeof(**(_v)), size) != \
+ offsetof(struct kdbus_cmd, size)); \
+ BUILD_BUG_ON(offsetof(typeof(**(_v)), flags) != \
+ offsetof(struct kdbus_cmd, flags)); \
+ __kdbus_args_parse((_args), 0, (_argp), sizeof(**(_v)), \
+ offsetof(typeof(**(_v)), items), \
+ (void **)(_v)); \
+ })
+
+#endif
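
The kdbus_arg/kdbus_args structures and the kdbus_args_parse()/kdbus_args_clear() pair are used by every command handler in this patch (see kdbus_cmd_match_add() further down for a real instance). The following is a hedged sketch of the expected calling convention; demo_cmd_handler and its item table are hypothetical, only the helpers, flags and item types come from this patch.

/* Hypothetical command handler showing the intended calling convention. */
static int demo_cmd_handler(struct kdbus_conn *conn, void __user *argp)
{
	struct kdbus_cmd *cmd;
	int ret;

	struct kdbus_arg argv[] = {
		{ .type = KDBUS_ITEM_NEGOTIATE },
		{ .type = KDBUS_ITEM_NAME, .mandatory = true },
	};
	struct kdbus_args args = {
		.allowed_flags	= KDBUS_FLAG_NEGOTIATE,
		.argv		= argv,
		.argc		= ARRAY_SIZE(argv),
	};

	ret = kdbus_args_parse(&args, argp, &cmd);
	if (ret != 0)
		return ret;	/* error, or negotiation-only probe */

	/*
	 * Here @cmd points to the kernel copy of the payload and
	 * argv[1].item to the first (mandatory) KDBUS_ITEM_NAME.
	 */

	return kdbus_args_clear(&args, ret);
}

As in the real handlers, a non-zero return from kdbus_args_parse() is handed straight back to the ioctl dispatcher, which maps positive values to 0.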
diff --git a/ipc/kdbus/item.c b/ipc/kdbus/item.c
new file mode 100644
index 000000000..ce78dba03
--- /dev/null
+++ b/ipc/kdbus/item.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+
+#include "item.h"
+#include "limits.h"
+#include "util.h"
+
+/*
+ * This verifies that the string at position @str with size @size is properly
+ * zero-terminated and contains no 0-byte except at the very end.
+ */
+static bool kdbus_str_valid(const char *str, size_t size)
+{
+ return size > 0 && memchr(str, '\0', size) == str + size - 1;
+}
+
+/**
+ * kdbus_item_validate_name() - validate an item containing a name
+ * @item: Item to validate
+ *
+ * Return: zero on success or a negative error code on failure
+ */
+int kdbus_item_validate_name(const struct kdbus_item *item)
+{
+ const char *name = item->str;
+ unsigned int i;
+ size_t len;
+
+ if (item->size < KDBUS_ITEM_HEADER_SIZE + 2)
+ return -EINVAL;
+
+ if (item->size > KDBUS_ITEM_HEADER_SIZE +
+ KDBUS_SYSNAME_MAX_LEN + 1)
+ return -ENAMETOOLONG;
+
+ if (!kdbus_str_valid(name, KDBUS_ITEM_PAYLOAD_SIZE(item)))
+ return -EINVAL;
+
+ len = strlen(name);
+ if (len == 0)
+ return -EINVAL;
+
+ for (i = 0; i < len; i++) {
+ if (isalpha(name[i]))
+ continue;
+ if (isdigit(name[i]))
+ continue;
+ if (name[i] == '_')
+ continue;
+ if (i > 0 && i + 1 < len && (name[i] == '-' || name[i] == '.'))
+ continue;
+
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * kdbus_item_validate() - validate a single item
+ * @item: item to validate
+ *
+ * Return: 0 if item is valid, negative error code if not.
+ */
+int kdbus_item_validate(const struct kdbus_item *item)
+{
+ size_t payload_size = KDBUS_ITEM_PAYLOAD_SIZE(item);
+ size_t l;
+ int ret;
+
+ BUILD_BUG_ON(KDBUS_ITEM_HEADER_SIZE !=
+ sizeof(struct kdbus_item_header));
+
+ if (item->size < KDBUS_ITEM_HEADER_SIZE)
+ return -EINVAL;
+
+ switch (item->type) {
+ case KDBUS_ITEM_NEGOTIATE:
+ if (payload_size % sizeof(u64) != 0)
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_PAYLOAD_VEC:
+ case KDBUS_ITEM_PAYLOAD_OFF:
+ if (payload_size != sizeof(struct kdbus_vec))
+ return -EINVAL;
+ if (item->vec.size == 0 || item->vec.size > SIZE_MAX)
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_PAYLOAD_MEMFD:
+ if (payload_size != sizeof(struct kdbus_memfd))
+ return -EINVAL;
+ if (item->memfd.size == 0 || item->memfd.size > SIZE_MAX)
+ return -EINVAL;
+ if (item->memfd.fd < 0)
+ return -EBADF;
+ break;
+
+ case KDBUS_ITEM_FDS:
+ if (payload_size % sizeof(int) != 0)
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_CANCEL_FD:
+ if (payload_size != sizeof(int))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_BLOOM_PARAMETER:
+ if (payload_size != sizeof(struct kdbus_bloom_parameter))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_BLOOM_FILTER:
+ /* followed by the bloom-mask, depends on the bloom-size */
+ if (payload_size < sizeof(struct kdbus_bloom_filter))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_BLOOM_MASK:
+ /* size depends on bloom-size of bus */
+ break;
+
+ case KDBUS_ITEM_CONN_DESCRIPTION:
+ case KDBUS_ITEM_MAKE_NAME:
+ ret = kdbus_item_validate_name(item);
+ if (ret < 0)
+ return ret;
+ break;
+
+ case KDBUS_ITEM_ATTACH_FLAGS_SEND:
+ case KDBUS_ITEM_ATTACH_FLAGS_RECV:
+ case KDBUS_ITEM_ID:
+ case KDBUS_ITEM_DST_ID:
+ if (payload_size != sizeof(u64))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_TIMESTAMP:
+ if (payload_size != sizeof(struct kdbus_timestamp))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_CREDS:
+ if (payload_size != sizeof(struct kdbus_creds))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_AUXGROUPS:
+ if (payload_size % sizeof(u32) != 0)
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_NAME:
+ case KDBUS_ITEM_DST_NAME:
+ case KDBUS_ITEM_PID_COMM:
+ case KDBUS_ITEM_TID_COMM:
+ case KDBUS_ITEM_EXE:
+ case KDBUS_ITEM_CMDLINE:
+ case KDBUS_ITEM_CGROUP:
+ case KDBUS_ITEM_SECLABEL:
+ if (!kdbus_str_valid(item->str, payload_size))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_CAPS:
+ if (payload_size < sizeof(u32))
+ return -EINVAL;
+ if (payload_size < sizeof(u32) +
+ 4 * CAP_TO_INDEX(item->caps.last_cap) * sizeof(u32))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_AUDIT:
+ if (payload_size != sizeof(struct kdbus_audit))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_POLICY_ACCESS:
+ if (payload_size != sizeof(struct kdbus_policy_access))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_NAME_ADD:
+ case KDBUS_ITEM_NAME_REMOVE:
+ case KDBUS_ITEM_NAME_CHANGE:
+ if (payload_size < sizeof(struct kdbus_notify_name_change))
+ return -EINVAL;
+ l = payload_size - offsetof(struct kdbus_notify_name_change,
+ name);
+ if (l > 0 && !kdbus_str_valid(item->name_change.name, l))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_ID_ADD:
+ case KDBUS_ITEM_ID_REMOVE:
+ if (payload_size != sizeof(struct kdbus_notify_id_change))
+ return -EINVAL;
+ break;
+
+ case KDBUS_ITEM_REPLY_TIMEOUT:
+ case KDBUS_ITEM_REPLY_DEAD:
+ if (payload_size != 0)
+ return -EINVAL;
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+/**
+ * kdbus_items_validate() - validate items passed by user-space
+ * @items: items to validate
+ * @items_size:	size of the item stream in bytes
+ *
+ * This verifies that the passed items pointer is consistent and valid.
+ * Furthermore, each item is checked for:
+ * - valid "size" value
+ * - payload is of expected type
+ * - payload is fully included in the item
+ * - string payloads are zero-terminated
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_items_validate(const struct kdbus_item *items, size_t items_size)
+{
+ const struct kdbus_item *item;
+ int ret;
+
+ KDBUS_ITEMS_FOREACH(item, items, items_size) {
+ if (!KDBUS_ITEM_VALID(item, items, items_size))
+ return -EINVAL;
+
+ ret = kdbus_item_validate(item);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (!KDBUS_ITEMS_END(item, items, items_size))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * kdbus_item_set() - Set item content
+ * @item: The item to modify
+ * @type: The item type to set (KDBUS_ITEM_*)
+ * @data: Data to copy to item->data, may be %NULL
+ * @len: Number of bytes in @data
+ *
+ * This sets type, size and data fields of an item. If @data is NULL, the data
+ * memory is cleared.
+ *
+ * Note that you must align your @data memory to 8 bytes. Trailing padding (in
+ * case @len is not 8-byte aligned) is cleared by this call.
+ *
+ * Return: Pointer to the following item.
+ */
+struct kdbus_item *kdbus_item_set(struct kdbus_item *item, u64 type,
+ const void *data, size_t len)
+{
+ item->type = type;
+ item->size = KDBUS_ITEM_HEADER_SIZE + len;
+
+ if (data) {
+ memcpy(item->data, data, len);
+ memset(item->data + len, 0, KDBUS_ALIGN8(len) - len);
+ } else {
+ memset(item->data, 0, KDBUS_ALIGN8(len));
+ }
+
+ return KDBUS_ITEM_NEXT(item);
+}
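
kdbus_item_set() above fills one item and returns a pointer just past its 8-byte aligned end, so a sequence of calls lays out a contiguous item stream. A small sketch of that usage follows; the function name, item types and payload values are made up, only kdbus_item_set() and the KDBUS_ITEM_* size helpers are from this patch (kernel headers assumed).

static void demo_build_items(void)
{
	char buf[KDBUS_ITEM_SIZE(sizeof(u64)) + KDBUS_ITEM_SIZE(8)]
		__aligned(8);
	struct kdbus_item *item = (struct kdbus_item *)buf;
	u64 id = 1;

	/* 8-byte payload copied in; returns a pointer past the padded item */
	item = kdbus_item_set(item, KDBUS_ITEM_ID, &id, sizeof(id));

	/* NULL data: the (aligned) payload area is simply cleared */
	kdbus_item_set(item, KDBUS_ITEM_NEGOTIATE, NULL, 8);
}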
diff --git a/ipc/kdbus/item.h b/ipc/kdbus/item.h
new file mode 100644
index 000000000..3a7e6ccc2
--- /dev/null
+++ b/ipc/kdbus/item.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_ITEM_H
+#define __KDBUS_ITEM_H
+
+#include <linux/kernel.h>
+#include <uapi/linux/kdbus.h>
+
+#include "util.h"
+
+/* generic access and iterators over a stream of items */
+#define KDBUS_ITEM_NEXT(_i) (typeof(_i))((u8 *)(_i) + KDBUS_ALIGN8((_i)->size))
+#define KDBUS_ITEMS_SIZE(_h, _is) ((_h)->size - offsetof(typeof(*(_h)), _is))
+#define KDBUS_ITEM_HEADER_SIZE offsetof(struct kdbus_item, data)
+#define KDBUS_ITEM_SIZE(_s) KDBUS_ALIGN8(KDBUS_ITEM_HEADER_SIZE + (_s))
+#define KDBUS_ITEM_PAYLOAD_SIZE(_i) ((_i)->size - KDBUS_ITEM_HEADER_SIZE)
+
+#define KDBUS_ITEMS_FOREACH(_i, _is, _s) \
+ for ((_i) = (_is); \
+ ((u8 *)(_i) < (u8 *)(_is) + (_s)) && \
+ ((u8 *)(_i) >= (u8 *)(_is)); \
+ (_i) = KDBUS_ITEM_NEXT(_i))
+
+#define KDBUS_ITEM_VALID(_i, _is, _s) \
+ ((_i)->size >= KDBUS_ITEM_HEADER_SIZE && \
+ (u8 *)(_i) + (_i)->size > (u8 *)(_i) && \
+ (u8 *)(_i) + (_i)->size <= (u8 *)(_is) + (_s) && \
+ (u8 *)(_i) >= (u8 *)(_is))
+
+#define KDBUS_ITEMS_END(_i, _is, _s) \
+ ((u8 *)(_i) == ((u8 *)(_is) + KDBUS_ALIGN8(_s)))
+
+/**
+ * struct kdbus_item_header - Describes the fixed part of an item
+ * @size: The total size of the item
+ * @type: The item type, one of KDBUS_ITEM_*
+ */
+struct kdbus_item_header {
+ u64 size;
+ u64 type;
+};
+
+int kdbus_item_validate_name(const struct kdbus_item *item);
+int kdbus_item_validate(const struct kdbus_item *item);
+int kdbus_items_validate(const struct kdbus_item *items, size_t items_size);
+struct kdbus_item *kdbus_item_set(struct kdbus_item *item, u64 type,
+ const void *data, size_t len);
+
+#endif
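
The iterator macros above are meant to be used after kdbus_items_validate() has succeeded; a consumer can then walk the stream without further bounds checks. A minimal sketch of such a consumer (the helper name is hypothetical):

static const char *demo_find_name(const struct kdbus_item *items, size_t size)
{
	const struct kdbus_item *item;

	KDBUS_ITEMS_FOREACH(item, items, size)
		if (item->type == KDBUS_ITEM_NAME)
			return item->str;

	return NULL;
}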
diff --git a/ipc/kdbus/limits.h b/ipc/kdbus/limits.h
new file mode 100644
index 000000000..c54925a25
--- /dev/null
+++ b/ipc/kdbus/limits.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_DEFAULTS_H
+#define __KDBUS_DEFAULTS_H
+
+#include <linux/kernel.h>
+
+/* maximum size of message header and items */
+#define KDBUS_MSG_MAX_SIZE SZ_8K
+
+/* maximum number of memfd items per message */
+#define KDBUS_MSG_MAX_MEMFD_ITEMS 16
+
+/* max size of ioctl command data */
+#define KDBUS_CMD_MAX_SIZE SZ_32K
+
+/* maximum number of inflight fds in a target queue per user */
+#define KDBUS_CONN_MAX_FDS_PER_USER 16
+
+/* maximum message payload size */
+#define KDBUS_MSG_MAX_PAYLOAD_VEC_SIZE SZ_2M
+
+/* maximum size of bloom bit field in bytes */
+#define KDBUS_BUS_BLOOM_MAX_SIZE SZ_4K
+
+/* maximum length of well-known bus name */
+#define KDBUS_NAME_MAX_LEN 255
+
+/* maximum length of bus, domain, ep name */
+#define KDBUS_SYSNAME_MAX_LEN 63
+
+/* maximum number of matches per connection */
+#define KDBUS_MATCH_MAX 256
+
+/* maximum number of queued messages from the same individual user */
+#define KDBUS_CONN_MAX_MSGS 256
+
+/* maximum number of well-known names per connection */
+#define KDBUS_CONN_MAX_NAMES 256
+
+/* maximum number of queued requests waiting for a reply */
+#define KDBUS_CONN_MAX_REQUESTS_PENDING 128
+
+/* maximum number of connections per user in one domain */
+#define KDBUS_USER_MAX_CONN 1024
+
+/* maximum number of buses per user in one domain */
+#define KDBUS_USER_MAX_BUSES 16
+
+#endif
diff --git a/ipc/kdbus/main.c b/ipc/kdbus/main.c
new file mode 100644
index 000000000..1ad4dc8da
--- /dev/null
+++ b/ipc/kdbus/main.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include "util.h"
+#include "fs.h"
+#include "handle.h"
+#include "metadata.h"
+#include "node.h"
+
+/*
+ * This is a simplified outline of the internal kdbus object relations, for
+ * those interested in the inner life of the driver implementation.
+ *
+ * From a mount point's (domain's) perspective:
+ *
+ * struct kdbus_domain
+ * |» struct kdbus_user *user (many, owned)
+ * '» struct kdbus_node node (embedded)
+ * |» struct kdbus_node children (many, referenced)
+ * |» struct kdbus_node *parent (pinned)
+ * '» struct kdbus_bus (many, pinned)
+ * |» struct kdbus_node node (embedded)
+ * '» struct kdbus_ep (many, pinned)
+ * |» struct kdbus_node node (embedded)
+ * |» struct kdbus_bus *bus (pinned)
+ * |» struct kdbus_conn conn_list (many, pinned)
+ * | |» struct kdbus_ep *ep (pinned)
+ * | |» struct kdbus_name_entry *activator_of (owned)
+ * | |» struct kdbus_meta *meta (owned)
+ * | |» struct kdbus_match_db *match_db (owned)
+ * | | '» struct kdbus_match_entry (many, owned)
+ * | |
+ * | |» struct kdbus_pool *pool (owned)
+ * | | '» struct kdbus_pool_slice *slices (many, owned)
+ * | | '» struct kdbus_pool *pool (pinned)
+ * | |
+ * | |» struct kdbus_user *user (pinned)
+ * | `» struct kdbus_queue_entry entries (many, embedded)
+ * | |» struct kdbus_pool_slice *slice (pinned)
+ * | |» struct kdbus_conn_reply *reply (owned)
+ * | '» struct kdbus_user *user (pinned)
+ * |
+ * '» struct kdbus_user *user (pinned)
+ * '» struct kdbus_policy_db policy_db (embedded)
+ * |» struct kdbus_policy_db_entry (many, owned)
+ * | |» struct kdbus_conn (pinned)
+ * | '» struct kdbus_ep (pinned)
+ * |
+ * '» struct kdbus_policy_db_cache_entry (many, owned)
+ * '» struct kdbus_conn (pinned)
+ *
+ * For the life-time of a file descriptor derived from calling open() on a file
+ * inside the mount point:
+ *
+ * struct kdbus_handle
+ * |» struct kdbus_meta *meta (owned)
+ * |» struct kdbus_ep *ep (pinned)
+ * |» struct kdbus_conn *conn (owned)
+ * '» struct kdbus_ep *ep (owned)
+ */
+
+/* kdbus mount-point /sys/fs/kdbus */
+static struct kobject *kdbus_dir;
+
+static int __init kdbus_init(void)
+{
+ int ret;
+
+ kdbus_dir = kobject_create_and_add(KBUILD_MODNAME, fs_kobj);
+ if (!kdbus_dir)
+ return -ENOMEM;
+
+ ret = kdbus_fs_init();
+ if (ret < 0) {
+ pr_err("cannot register filesystem: %d\n", ret);
+ goto exit_dir;
+ }
+
+ pr_info("initialized\n");
+ return 0;
+
+exit_dir:
+ kobject_put(kdbus_dir);
+ return ret;
+}
+
+static void __exit kdbus_exit(void)
+{
+ kdbus_fs_exit();
+ kobject_put(kdbus_dir);
+ ida_destroy(&kdbus_node_ida);
+}
+
+module_init(kdbus_init);
+module_exit(kdbus_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("D-Bus, powerful, easy to use interprocess communication");
+MODULE_ALIAS_FS(KBUILD_MODNAME "fs");
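
The "(embedded)" relations in the diagram above follow the usual kernel pattern of embedding a generic node in the specific object and recovering the container with container_of(). A one-line sketch, assuming bus.h/node.h are included and the member is called node as the diagram shows (the helper name is made up):

static inline struct kdbus_bus *demo_node_to_bus(struct kdbus_node *node)
{
	return container_of(node, struct kdbus_bus, node);
}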
diff --git a/ipc/kdbus/match.c b/ipc/kdbus/match.c
new file mode 100644
index 000000000..4ee6a1f2e
--- /dev/null
+++ b/ipc/kdbus/match.c
@@ -0,0 +1,546 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "match.h"
+#include "message.h"
+#include "names.h"
+
+/**
+ * struct kdbus_match_db - message filters
+ * @entries_list: List of matches
+ * @mdb_rwlock: Match data lock
+ * @entries_count: Number of entries in database
+ */
+struct kdbus_match_db {
+ struct list_head entries_list;
+ struct rw_semaphore mdb_rwlock;
+ unsigned int entries_count;
+};
+
+/**
+ * struct kdbus_match_entry - a match database entry
+ * @cookie: User-supplied cookie to lookup the entry
+ * @list_entry: The list entry element for the db list
+ * @rules_list: The list head for tracking rules of this entry
+ */
+struct kdbus_match_entry {
+ u64 cookie;
+ struct list_head list_entry;
+ struct list_head rules_list;
+};
+
+/**
+ * struct kdbus_bloom_mask - mask to match against filter
+ * @generations: Number of generations carried
+ * @data: Array of bloom bit fields
+ */
+struct kdbus_bloom_mask {
+ u64 generations;
+ u64 *data;
+};
+
+/**
+ * struct kdbus_match_rule - a rule appended to a match entry
+ * @type: An item type to match against
+ * @bloom_mask: Bloom mask to match a message's filter against, used
+ * with KDBUS_ITEM_BLOOM_MASK
+ * @name: Name to match against, used with KDBUS_ITEM_NAME,
+ * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE}
+ * @old_id: ID to match against, used with
+ * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE},
+ * KDBUS_ITEM_ID_REMOVE
+ * @new_id: ID to match against, used with
+ * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE},
+ *			KDBUS_ITEM_ID_ADD
+ * @src_id: ID to match against, used with KDBUS_ITEM_ID
+ * @dst_id: Message destination ID, used with KDBUS_ITEM_DST_ID
+ * @rules_entry: Entry in the entry's rules list
+ */
+struct kdbus_match_rule {
+ u64 type;
+ union {
+ struct kdbus_bloom_mask bloom_mask;
+ struct {
+ char *name;
+ u64 old_id;
+ u64 new_id;
+ };
+ u64 src_id;
+ u64 dst_id;
+ };
+ struct list_head rules_entry;
+};
+
+static void kdbus_match_rule_free(struct kdbus_match_rule *rule)
+{
+ if (!rule)
+ return;
+
+ switch (rule->type) {
+ case KDBUS_ITEM_BLOOM_MASK:
+ kfree(rule->bloom_mask.data);
+ break;
+
+ case KDBUS_ITEM_NAME:
+ case KDBUS_ITEM_NAME_ADD:
+ case KDBUS_ITEM_NAME_REMOVE:
+ case KDBUS_ITEM_NAME_CHANGE:
+ kfree(rule->name);
+ break;
+
+ case KDBUS_ITEM_ID:
+ case KDBUS_ITEM_DST_ID:
+ case KDBUS_ITEM_ID_ADD:
+ case KDBUS_ITEM_ID_REMOVE:
+ break;
+
+ default:
+ BUG();
+ }
+
+ list_del(&rule->rules_entry);
+ kfree(rule);
+}
+
+static void kdbus_match_entry_free(struct kdbus_match_entry *entry)
+{
+ struct kdbus_match_rule *r, *tmp;
+
+ if (!entry)
+ return;
+
+ list_for_each_entry_safe(r, tmp, &entry->rules_list, rules_entry)
+ kdbus_match_rule_free(r);
+
+ list_del(&entry->list_entry);
+ kfree(entry);
+}
+
+/**
+ * kdbus_match_db_free() - free match db resources
+ * @mdb: The match database
+ */
+void kdbus_match_db_free(struct kdbus_match_db *mdb)
+{
+ struct kdbus_match_entry *entry, *tmp;
+
+ if (!mdb)
+ return;
+
+ list_for_each_entry_safe(entry, tmp, &mdb->entries_list, list_entry)
+ kdbus_match_entry_free(entry);
+
+ kfree(mdb);
+}
+
+/**
+ * kdbus_match_db_new() - create a new match database
+ *
+ * Return: a new kdbus_match_db on success, ERR_PTR on failure.
+ */
+struct kdbus_match_db *kdbus_match_db_new(void)
+{
+ struct kdbus_match_db *d;
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return ERR_PTR(-ENOMEM);
+
+ init_rwsem(&d->mdb_rwlock);
+ INIT_LIST_HEAD(&d->entries_list);
+
+ return d;
+}
+
+static bool kdbus_match_bloom(const struct kdbus_bloom_filter *filter,
+ const struct kdbus_bloom_mask *mask,
+ const struct kdbus_conn *conn)
+{
+ size_t n = conn->ep->bus->bloom.size / sizeof(u64);
+ const u64 *m;
+ size_t i;
+
+ /*
+	 * The message's filter carries a generation identifier, while the
+	 * match's mask may carry an array of multiple mask generations.
+	 * Select the mask generation that is closest to the filter's
+	 * generation.
+ */
+ m = mask->data + (min(filter->generation, mask->generations - 1) * n);
+
+ /*
+	 * The message's filter contains the message's properties, while
+	 * the match's mask contains the properties to look for in the
+	 * message. Check the mask bit field against the filter bit field
+	 * to see whether the message possibly carries the properties the
+	 * connection has subscribed to.
+ */
+ for (i = 0; i < n; i++)
+ if ((filter->data[i] & m[i]) != m[i])
+ return false;
+
+ return true;
+}
+
+static bool kdbus_match_rule_conn(const struct kdbus_match_rule *r,
+ struct kdbus_conn *c,
+ const struct kdbus_staging *s)
+{
+ lockdep_assert_held(&c->ep->bus->name_registry->rwlock);
+
+ switch (r->type) {
+ case KDBUS_ITEM_BLOOM_MASK:
+ return kdbus_match_bloom(s->bloom_filter, &r->bloom_mask, c);
+ case KDBUS_ITEM_ID:
+ return r->src_id == c->id || r->src_id == KDBUS_MATCH_ID_ANY;
+ case KDBUS_ITEM_DST_ID:
+ return r->dst_id == s->msg->dst_id ||
+ r->dst_id == KDBUS_MATCH_ID_ANY;
+ case KDBUS_ITEM_NAME:
+ return kdbus_conn_has_name(c, r->name);
+ default:
+ return false;
+ }
+}
+
+static bool kdbus_match_rule_kernel(const struct kdbus_match_rule *r,
+ const struct kdbus_staging *s)
+{
+ struct kdbus_item *n = s->notify;
+
+ if (WARN_ON(!n) || n->type != r->type)
+ return false;
+
+ switch (r->type) {
+ case KDBUS_ITEM_ID_ADD:
+ return r->new_id == KDBUS_MATCH_ID_ANY ||
+ r->new_id == n->id_change.id;
+ case KDBUS_ITEM_ID_REMOVE:
+ return r->old_id == KDBUS_MATCH_ID_ANY ||
+ r->old_id == n->id_change.id;
+ case KDBUS_ITEM_NAME_ADD:
+ case KDBUS_ITEM_NAME_CHANGE:
+ case KDBUS_ITEM_NAME_REMOVE:
+ return (r->old_id == KDBUS_MATCH_ID_ANY ||
+ r->old_id == n->name_change.old_id.id) &&
+ (r->new_id == KDBUS_MATCH_ID_ANY ||
+ r->new_id == n->name_change.new_id.id) &&
+ (!r->name || !strcmp(r->name, n->name_change.name));
+ default:
+ return false;
+ }
+}
+
+static bool kdbus_match_rules(const struct kdbus_match_entry *entry,
+ struct kdbus_conn *c,
+ const struct kdbus_staging *s)
+{
+ struct kdbus_match_rule *r;
+
+ list_for_each_entry(r, &entry->rules_list, rules_entry)
+ if ((c && !kdbus_match_rule_conn(r, c, s)) ||
+ (!c && !kdbus_match_rule_kernel(r, s)))
+ return false;
+
+ return true;
+}
+
+/**
+ * kdbus_match_db_match_msg() - match a msg object against the database entries
+ * @mdb: The match database
+ * @conn_src: The connection object originating the message
+ * @staging: Staging object containing the message to match against
+ *
+ * This function will walk through all the database entries previously uploaded
+ * with kdbus_cmd_match_add(). As soon as any of them has an all-satisfied rule
+ * set, this function will return true.
+ *
+ * If @conn_src is non-NULL, the caller must hold the name-registry lock of
+ * conn_src->ep->bus.
+ *
+ * Return: true if there was a matching database entry, false otherwise.
+ */
+bool kdbus_match_db_match_msg(struct kdbus_match_db *mdb,
+ struct kdbus_conn *conn_src,
+ const struct kdbus_staging *staging)
+{
+ struct kdbus_match_entry *entry;
+ bool matched = false;
+
+ down_read(&mdb->mdb_rwlock);
+ list_for_each_entry(entry, &mdb->entries_list, list_entry) {
+ matched = kdbus_match_rules(entry, conn_src, staging);
+ if (matched)
+ break;
+ }
+ up_read(&mdb->mdb_rwlock);
+
+ return matched;
+}
+
+static int kdbus_match_db_remove_unlocked(struct kdbus_match_db *mdb,
+ u64 cookie)
+{
+ struct kdbus_match_entry *entry, *tmp;
+ bool found = false;
+
+ list_for_each_entry_safe(entry, tmp, &mdb->entries_list, list_entry)
+ if (entry->cookie == cookie) {
+ kdbus_match_entry_free(entry);
+ --mdb->entries_count;
+ found = true;
+ }
+
+ return found ? 0 : -EBADSLT;
+}
+
+/**
+ * kdbus_cmd_match_add() - handle KDBUS_CMD_MATCH_ADD
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * One call to this function (or one ioctl(KDBUS_CMD_MATCH_ADD), respectively)
+ * adds one new database entry with n rules attached to it. Each rule is
+ * described by a kdbus_item, and an entry is considered matching if all
+ * its rules are satisfied.
+ *
+ * The items attached to a kdbus_cmd_match struct have the following mapping:
+ *
+ * KDBUS_ITEM_BLOOM_MASK: A bloom mask
+ * KDBUS_ITEM_NAME: A connection's source name
+ * KDBUS_ITEM_ID: A connection ID
+ * KDBUS_ITEM_DST_ID: A connection ID
+ * KDBUS_ITEM_NAME_ADD:
+ * KDBUS_ITEM_NAME_REMOVE:
+ * KDBUS_ITEM_NAME_CHANGE: Well-known name changes, carry
+ * kdbus_notify_name_change
+ * KDBUS_ITEM_ID_ADD:
+ * KDBUS_ITEM_ID_REMOVE: Connection ID changes, carry
+ * kdbus_notify_id_change
+ *
+ * For kdbus_notify_{id,name}_change structs, only the ID and name fields
+ * are looked at when adding an entry. The flags are unused.
+ *
+ * Also note that KDBUS_ITEM_BLOOM_MASK, KDBUS_ITEM_NAME, KDBUS_ITEM_ID,
+ * and KDBUS_ITEM_DST_ID are used to match messages from userspace, while the
+ * others apply to kernel-generated notifications.
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_match_add(struct kdbus_conn *conn, void __user *argp)
+{
+ struct kdbus_match_db *mdb = conn->match_db;
+ struct kdbus_match_entry *entry = NULL;
+ struct kdbus_cmd_match *cmd;
+ struct kdbus_item *item;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_BLOOM_MASK, .multiple = true },
+ { .type = KDBUS_ITEM_NAME, .multiple = true },
+ { .type = KDBUS_ITEM_ID, .multiple = true },
+ { .type = KDBUS_ITEM_DST_ID, .multiple = true },
+ { .type = KDBUS_ITEM_NAME_ADD, .multiple = true },
+ { .type = KDBUS_ITEM_NAME_REMOVE, .multiple = true },
+ { .type = KDBUS_ITEM_NAME_CHANGE, .multiple = true },
+ { .type = KDBUS_ITEM_ID_ADD, .multiple = true },
+ { .type = KDBUS_ITEM_ID_REMOVE, .multiple = true },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE |
+ KDBUS_MATCH_REPLACE,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ if (!kdbus_conn_is_ordinary(conn))
+ return -EOPNOTSUPP;
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ entry->cookie = cmd->cookie;
+ INIT_LIST_HEAD(&entry->list_entry);
+ INIT_LIST_HEAD(&entry->rules_list);
+
+ KDBUS_ITEMS_FOREACH(item, cmd->items, KDBUS_ITEMS_SIZE(cmd, items)) {
+ struct kdbus_match_rule *rule;
+ size_t size = item->size - offsetof(struct kdbus_item, data);
+
+ rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+ if (!rule) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ rule->type = item->type;
+ INIT_LIST_HEAD(&rule->rules_entry);
+
+ switch (item->type) {
+ case KDBUS_ITEM_BLOOM_MASK: {
+ u64 bsize = conn->ep->bus->bloom.size;
+ u64 generations;
+ u64 remainder;
+
+ generations = div64_u64_rem(size, bsize, &remainder);
+ if (size < bsize || remainder > 0) {
+ ret = -EDOM;
+ break;
+ }
+
+ rule->bloom_mask.data = kmemdup(item->data,
+ size, GFP_KERNEL);
+ if (!rule->bloom_mask.data) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ rule->bloom_mask.generations = generations;
+ break;
+ }
+
+ case KDBUS_ITEM_NAME:
+ if (!kdbus_name_is_valid(item->str, false)) {
+ ret = -EINVAL;
+ break;
+ }
+
+ rule->name = kstrdup(item->str, GFP_KERNEL);
+ if (!rule->name)
+ ret = -ENOMEM;
+
+ break;
+
+ case KDBUS_ITEM_ID:
+ rule->src_id = item->id;
+ break;
+
+ case KDBUS_ITEM_DST_ID:
+ rule->dst_id = item->id;
+ break;
+
+ case KDBUS_ITEM_NAME_ADD:
+ case KDBUS_ITEM_NAME_REMOVE:
+ case KDBUS_ITEM_NAME_CHANGE:
+ rule->old_id = item->name_change.old_id.id;
+ rule->new_id = item->name_change.new_id.id;
+
+ if (size > sizeof(struct kdbus_notify_name_change)) {
+ rule->name = kstrdup(item->name_change.name,
+ GFP_KERNEL);
+ if (!rule->name)
+ ret = -ENOMEM;
+ }
+
+ break;
+
+ case KDBUS_ITEM_ID_ADD:
+ case KDBUS_ITEM_ID_REMOVE:
+ if (item->type == KDBUS_ITEM_ID_ADD)
+ rule->new_id = item->id_change.id;
+ else
+ rule->old_id = item->id_change.id;
+
+ break;
+ }
+
+ if (ret < 0) {
+ kdbus_match_rule_free(rule);
+ goto exit;
+ }
+
+ list_add_tail(&rule->rules_entry, &entry->rules_list);
+ }
+
+ down_write(&mdb->mdb_rwlock);
+
+ /* Remove any entry that has the same cookie as the current one. */
+ if (cmd->flags & KDBUS_MATCH_REPLACE)
+ kdbus_match_db_remove_unlocked(mdb, entry->cookie);
+
+ /*
+ * If the above removal caught any entry, there will be room for the
+ * new one.
+ */
+ if (++mdb->entries_count > KDBUS_MATCH_MAX) {
+ --mdb->entries_count;
+ ret = -EMFILE;
+ } else {
+ list_add_tail(&entry->list_entry, &mdb->entries_list);
+ entry = NULL;
+ }
+
+ up_write(&mdb->mdb_rwlock);
+
+exit:
+ kdbus_match_entry_free(entry);
+ return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_match_remove() - handle KDBUS_CMD_MATCH_REMOVE
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_match_remove(struct kdbus_conn *conn, void __user *argp)
+{
+ struct kdbus_cmd_match *cmd;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ if (!kdbus_conn_is_ordinary(conn))
+ return -EOPNOTSUPP;
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ down_write(&conn->match_db->mdb_rwlock);
+ ret = kdbus_match_db_remove_unlocked(conn->match_db, cmd->cookie);
+ up_write(&conn->match_db->mdb_rwlock);
+
+ return kdbus_args_clear(&args, ret);
+}
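
The bloom test in kdbus_match_bloom() above reduces to: a mask matches a filter iff every bit set in the mask is also set in the filter, checked one 64-bit word at a time once the right mask generation has been selected. A stripped-down sketch of just that word-wise check (the demo_* name is hypothetical, kernel headers assumed):

static bool demo_bloom_match(const u64 *filter, const u64 *mask, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if ((filter[i] & mask[i]) != mask[i])
			return false;	/* a subscribed property is missing */

	return true;
}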
diff --git a/ipc/kdbus/match.h b/ipc/kdbus/match.h
new file mode 100644
index 000000000..ceb492f8e
--- /dev/null
+++ b/ipc/kdbus/match.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_MATCH_H
+#define __KDBUS_MATCH_H
+
+struct kdbus_conn;
+struct kdbus_match_db;
+struct kdbus_staging;
+
+struct kdbus_match_db *kdbus_match_db_new(void);
+void kdbus_match_db_free(struct kdbus_match_db *db);
+int kdbus_match_db_add(struct kdbus_conn *conn,
+ struct kdbus_cmd_match *cmd);
+int kdbus_match_db_remove(struct kdbus_conn *conn,
+ struct kdbus_cmd_match *cmd);
+bool kdbus_match_db_match_msg(struct kdbus_match_db *db,
+ struct kdbus_conn *conn_src,
+ const struct kdbus_staging *staging);
+
+int kdbus_cmd_match_add(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_match_remove(struct kdbus_conn *conn, void __user *argp);
+
+#endif
diff --git a/ipc/kdbus/message.c b/ipc/kdbus/message.c
new file mode 100644
index 000000000..432dba4dc
--- /dev/null
+++ b/ipc/kdbus/message.c
@@ -0,0 +1,1040 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/cgroup.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/shmem_fs.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <net/sock.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "match.h"
+#include "message.h"
+#include "names.h"
+#include "policy.h"
+
+static const char * const zeros = "\0\0\0\0\0\0\0";
+
+static struct kdbus_gaps *kdbus_gaps_new(size_t n_memfds, size_t n_fds)
+{
+ size_t size_offsets, size_memfds, size_fds, size;
+ struct kdbus_gaps *gaps;
+
+ size_offsets = n_memfds * sizeof(*gaps->memfd_offsets);
+ size_memfds = n_memfds * sizeof(*gaps->memfd_files);
+ size_fds = n_fds * sizeof(*gaps->fd_files);
+ size = sizeof(*gaps) + size_offsets + size_memfds + size_fds;
+
+ gaps = kzalloc(size, GFP_KERNEL);
+ if (!gaps)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&gaps->kref);
+ gaps->n_memfds = 0; /* we reserve n_memfds, but don't enforce them */
+ gaps->memfd_offsets = (void *)(gaps + 1);
+ gaps->memfd_files = (void *)((u8 *)gaps->memfd_offsets + size_offsets);
+ gaps->n_fds = 0; /* we reserve n_fds, but don't enforce them */
+ gaps->fd_files = (void *)((u8 *)gaps->memfd_files + size_memfds);
+
+ return gaps;
+}
+
+static void kdbus_gaps_free(struct kref *kref)
+{
+ struct kdbus_gaps *gaps = container_of(kref, struct kdbus_gaps, kref);
+ size_t i;
+
+ for (i = 0; i < gaps->n_fds; ++i)
+ if (gaps->fd_files[i])
+ fput(gaps->fd_files[i]);
+ for (i = 0; i < gaps->n_memfds; ++i)
+ if (gaps->memfd_files[i])
+ fput(gaps->memfd_files[i]);
+
+ kfree(gaps);
+}
+
+/**
+ * kdbus_gaps_ref() - gain reference
+ * @gaps: gaps object
+ *
+ * Return: @gaps is returned
+ */
+struct kdbus_gaps *kdbus_gaps_ref(struct kdbus_gaps *gaps)
+{
+ if (gaps)
+ kref_get(&gaps->kref);
+ return gaps;
+}
+
+/**
+ * kdbus_gaps_unref() - drop reference
+ * @gaps: gaps object
+ *
+ * Return: NULL
+ */
+struct kdbus_gaps *kdbus_gaps_unref(struct kdbus_gaps *gaps)
+{
+ if (gaps)
+ kref_put(&gaps->kref, kdbus_gaps_free);
+ return NULL;
+}
+
+/**
+ * kdbus_gaps_install() - install file-descriptors
+ * @gaps: gaps object, or NULL
+ * @slice: pool slice that contains the message
+ * @out_incomplete: output variable to note incomplete fds
+ *
+ * This function installs all file-descriptors of @gaps into the current
+ * process and copies the file-descriptor numbers into the target pool slice.
+ *
+ * If the file-descriptors were only partially installed, then @out_incomplete
+ * will be set to true. Otherwise, it's set to false.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int kdbus_gaps_install(struct kdbus_gaps *gaps, struct kdbus_pool_slice *slice,
+ bool *out_incomplete)
+{
+ bool incomplete_fds = false;
+ struct kvec kvec;
+ size_t i, n_fds;
+ int ret, *fds;
+
+ if (!gaps) {
+ /* nothing to do */
+ *out_incomplete = incomplete_fds;
+ return 0;
+ }
+
+ n_fds = gaps->n_fds + gaps->n_memfds;
+ if (n_fds < 1) {
+ /* nothing to do */
+ *out_incomplete = incomplete_fds;
+ return 0;
+ }
+
+ fds = kmalloc_array(n_fds, sizeof(*fds), GFP_TEMPORARY);
+ n_fds = 0;
+ if (!fds)
+ return -ENOMEM;
+
+ /* 1) allocate fds and copy them over */
+
+ if (gaps->n_fds > 0) {
+ for (i = 0; i < gaps->n_fds; ++i) {
+ int fd;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0)
+ incomplete_fds = true;
+
+ WARN_ON(!gaps->fd_files[i]);
+
+ fds[n_fds++] = fd < 0 ? -1 : fd;
+ }
+
+ /*
+ * The file-descriptor array can only be present once per
+ * message. Hence, prepare all fds and then copy them over with
+ * a single kvec.
+ */
+
+ WARN_ON(!gaps->fd_offset);
+
+ kvec.iov_base = fds;
+ kvec.iov_len = gaps->n_fds * sizeof(*fds);
+ ret = kdbus_pool_slice_copy_kvec(slice, gaps->fd_offset,
+ &kvec, 1, kvec.iov_len);
+ if (ret < 0)
+ goto exit;
+ }
+
+ for (i = 0; i < gaps->n_memfds; ++i) {
+ int memfd;
+
+ memfd = get_unused_fd_flags(O_CLOEXEC);
+ if (memfd < 0) {
+ incomplete_fds = true;
+ /* memfds are initialized to -1, skip copying it */
+ continue;
+ }
+
+ fds[n_fds++] = memfd;
+
+ /*
+ * memfds have to be copied individually as they each are put
+ * into a separate item. This should not be an issue, though,
+ * as usually there is no need to send more than one memfd per
+ * message.
+ */
+
+ WARN_ON(!gaps->memfd_offsets[i]);
+ WARN_ON(!gaps->memfd_files[i]);
+
+ kvec.iov_base = &memfd;
+ kvec.iov_len = sizeof(memfd);
+ ret = kdbus_pool_slice_copy_kvec(slice, gaps->memfd_offsets[i],
+ &kvec, 1, kvec.iov_len);
+ if (ret < 0)
+ goto exit;
+ }
+
+ /* 2) install fds now that everything was successful */
+
+ for (i = 0; i < gaps->n_fds; ++i)
+ if (fds[i] >= 0)
+ fd_install(fds[i], get_file(gaps->fd_files[i]));
+ for (i = 0; i < gaps->n_memfds; ++i)
+ if (fds[gaps->n_fds + i] >= 0)
+ fd_install(fds[gaps->n_fds + i],
+ get_file(gaps->memfd_files[i]));
+
+ ret = 0;
+
+exit:
+ if (ret < 0)
+ for (i = 0; i < n_fds; ++i)
+ put_unused_fd(fds[i]);
+ kfree(fds);
+ *out_incomplete = incomplete_fds;
+ return ret;
+}
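
kdbus_gaps_install() above deliberately splits fd passing into two phases: first reserve fd numbers and copy them into the pool slice (steps that may still fail and can be undone with put_unused_fd()), and only then bind the numbers to files with fd_install(), which cannot be revoked. A reduced sketch of that pattern for a single descriptor; demo_install_one() and its user pointer are hypothetical, the helpers are the ones used above.

static int demo_install_one(struct file *f, int __user *uptr)
{
	int fd, ret;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;

	/* copy the number out while the reservation can still be undone */
	ret = put_user(fd, uptr);
	if (ret) {
		put_unused_fd(fd);
		return ret;
	}

	fd_install(fd, get_file(f));	/* point of no return */
	return 0;
}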
+
+static struct file *kdbus_get_fd(int fd)
+{
+ struct file *f, *ret;
+ struct inode *inode;
+ struct socket *sock;
+
+ if (fd < 0)
+ return ERR_PTR(-EBADF);
+
+ f = fget_raw(fd);
+ if (!f)
+ return ERR_PTR(-EBADF);
+
+ inode = file_inode(f);
+ sock = S_ISSOCK(inode->i_mode) ? SOCKET_I(inode) : NULL;
+
+ if (f->f_mode & FMODE_PATH)
+ ret = f; /* O_PATH is always allowed */
+ else if (f->f_op == &kdbus_handle_ops)
+ ret = ERR_PTR(-EOPNOTSUPP); /* disallow kdbus-fd over kdbus */
+ else if (sock && sock->sk && sock->ops && sock->ops->family == PF_UNIX)
+ ret = ERR_PTR(-EOPNOTSUPP); /* disallow UDS over kdbus */
+ else
+		ret = f; /* all others are allowed */
+
+ if (f != ret)
+ fput(f);
+
+ return ret;
+}
+
+static struct file *kdbus_get_memfd(const struct kdbus_memfd *memfd)
+{
+ const int m = F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL;
+ struct file *f, *ret;
+ int s;
+
+ if (memfd->fd < 0)
+ return ERR_PTR(-EBADF);
+
+ f = fget(memfd->fd);
+ if (!f)
+ return ERR_PTR(-EBADF);
+
+ s = shmem_get_seals(f);
+ if (s < 0)
+ ret = ERR_PTR(-EMEDIUMTYPE);
+ else if ((s & m) != m)
+ ret = ERR_PTR(-ETXTBSY);
+ else if (memfd->start + memfd->size > (u64)i_size_read(file_inode(f)))
+ ret = ERR_PTR(-EFAULT);
+ else
+ ret = f;
+
+ if (f != ret)
+ fput(f);
+
+ return ret;
+}
+
+static int kdbus_msg_examine(struct kdbus_msg *msg, struct kdbus_bus *bus,
+ struct kdbus_cmd_send *cmd, size_t *out_n_memfds,
+ size_t *out_n_fds, size_t *out_n_parts)
+{
+ struct kdbus_item *item, *fds = NULL, *bloom = NULL, *dstname = NULL;
+ u64 n_parts, n_memfds, n_fds, vec_size;
+
+ /*
+ * Step 1:
+ * Validate the message and command parameters.
+ */
+
+	/* KDBUS_PAYLOAD_KERNEL is reserved for kernel messages */
+ if (msg->payload_type == KDBUS_PAYLOAD_KERNEL)
+ return -EINVAL;
+
+ if (msg->dst_id == KDBUS_DST_ID_BROADCAST) {
+ /* broadcasts must be marked as signals */
+ if (!(msg->flags & KDBUS_MSG_SIGNAL))
+ return -EBADMSG;
+ /* broadcasts cannot have timeouts */
+ if (msg->timeout_ns > 0)
+ return -ENOTUNIQ;
+ }
+
+ if (msg->flags & KDBUS_MSG_EXPECT_REPLY) {
+ /* if you expect a reply, you must specify a timeout */
+ if (msg->timeout_ns == 0)
+ return -EINVAL;
+ /* signals cannot have replies */
+ if (msg->flags & KDBUS_MSG_SIGNAL)
+ return -ENOTUNIQ;
+ } else {
+ /* must expect reply if sent as synchronous call */
+ if (cmd->flags & KDBUS_SEND_SYNC_REPLY)
+ return -EINVAL;
+ /* cannot mark replies as signal */
+ if (msg->cookie_reply && (msg->flags & KDBUS_MSG_SIGNAL))
+ return -EINVAL;
+ }
+
+ /*
+ * Step 2:
+ * Validate all passed items. While at it, select some statistics that
+ * are required to allocate state objects later on.
+ *
+ * Generic item validation has already been done via
+ * kdbus_item_validate(). Furthermore, the number of items is naturally
+ * limited by the maximum message size. Hence, only non-generic item
+ * checks are performed here (mainly integer overflow tests).
+ */
+
+ n_parts = 0;
+ n_memfds = 0;
+ n_fds = 0;
+ vec_size = 0;
+
+ KDBUS_ITEMS_FOREACH(item, msg->items, KDBUS_ITEMS_SIZE(msg, items)) {
+ switch (item->type) {
+ case KDBUS_ITEM_PAYLOAD_VEC: {
+ void __force __user *ptr = KDBUS_PTR(item->vec.address);
+ u64 size = item->vec.size;
+
+ if (vec_size + size < vec_size)
+ return -EMSGSIZE;
+ if (vec_size + size > KDBUS_MSG_MAX_PAYLOAD_VEC_SIZE)
+ return -EMSGSIZE;
+ if (ptr && unlikely(!access_ok(VERIFY_READ, ptr, size)))
+ return -EFAULT;
+
+ if (ptr || size % 8) /* data or padding */
+ ++n_parts;
+ break;
+ }
+ case KDBUS_ITEM_PAYLOAD_MEMFD: {
+ u64 start = item->memfd.start;
+ u64 size = item->memfd.size;
+
+ if (start + size < start)
+ return -EMSGSIZE;
+ if (n_memfds >= KDBUS_MSG_MAX_MEMFD_ITEMS)
+ return -E2BIG;
+
+ ++n_memfds;
+ if (size % 8) /* vec-padding required */
+ ++n_parts;
+ break;
+ }
+ case KDBUS_ITEM_FDS: {
+ if (fds)
+ return -EEXIST;
+
+ fds = item;
+ n_fds = KDBUS_ITEM_PAYLOAD_SIZE(item) / sizeof(int);
+ if (n_fds > KDBUS_CONN_MAX_FDS_PER_USER)
+ return -EMFILE;
+
+ break;
+ }
+ case KDBUS_ITEM_BLOOM_FILTER: {
+ u64 bloom_size;
+
+ if (bloom)
+ return -EEXIST;
+
+ bloom = item;
+ bloom_size = KDBUS_ITEM_PAYLOAD_SIZE(item) -
+ offsetof(struct kdbus_bloom_filter, data);
+ if (!KDBUS_IS_ALIGNED8(bloom_size))
+ return -EFAULT;
+ if (bloom_size != bus->bloom.size)
+ return -EDOM;
+
+ break;
+ }
+ case KDBUS_ITEM_DST_NAME: {
+ if (dstname)
+ return -EEXIST;
+
+ dstname = item;
+ if (!kdbus_name_is_valid(item->str, false))
+ return -EINVAL;
+ if (msg->dst_id == KDBUS_DST_ID_BROADCAST)
+ return -EBADMSG;
+
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Step 3:
+ * Validate that required items were actually passed, and that no item
+ * contradicts the message flags.
+ */
+
+ /* bloom filters must be attached _iff_ it's a signal */
+ if (!(msg->flags & KDBUS_MSG_SIGNAL) != !bloom)
+ return -EBADMSG;
+ /* destination name is required if no ID is given */
+ if (msg->dst_id == KDBUS_DST_ID_NAME && !dstname)
+ return -EDESTADDRREQ;
+ /* cannot send file-descriptors attached to broadcasts */
+ if (msg->dst_id == KDBUS_DST_ID_BROADCAST && fds)
+ return -ENOTUNIQ;
+
+ *out_n_memfds = n_memfds;
+ *out_n_fds = n_fds;
+ *out_n_parts = n_parts;
+
+ return 0;
+}
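
The "vec_size + size < vec_size" and "start + size < start" tests in kdbus_msg_examine() are the standard unsigned overflow checks referred to in the comment above ("mainly integer overflow tests"): for unsigned integers the sum wraps exactly when the mathematical result does not fit the type. As a tiny sketch (later kernels also provide check_add_overflow() for the same purpose):

static bool demo_add_overflows(u64 a, u64 b)
{
	return a + b < a;	/* true iff the unsigned sum wrapped */
}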
+
+static bool kdbus_staging_merge_vecs(struct kdbus_staging *staging,
+ struct kdbus_item **prev_item,
+ struct iovec **prev_vec,
+ const struct kdbus_item *merge)
+{
+ void __user *ptr = (void __user *)KDBUS_PTR(merge->vec.address);
+ u64 padding = merge->vec.size % 8;
+ struct kdbus_item *prev = *prev_item;
+ struct iovec *vec = *prev_vec;
+
+ /* XXX: merging is disabled so far */
+ if (0 && prev && prev->type == KDBUS_ITEM_PAYLOAD_OFF &&
+ !merge->vec.address == !prev->vec.address) {
+ /*
+ * If we merge two VECs, we can always drop the second
+ * PAYLOAD_VEC item. Hence, include its size in the previous
+ * one.
+ */
+ prev->vec.size += merge->vec.size;
+
+ if (ptr) {
+ /*
+ * If we merge two data VECs, we need two iovecs to copy
+ * the data. But the items can be easily merged by
+ * summing their lengths.
+ */
+ vec = &staging->parts[staging->n_parts++];
+ vec->iov_len = merge->vec.size;
+ vec->iov_base = ptr;
+ staging->n_payload += vec->iov_len;
+ } else if (padding) {
+ /*
+ * If we merge two 0-vecs with the second 0-vec
+ * requiring padding, we need to insert an iovec to copy
+ * the 0-padding. We try merging it with the previous
+ * 0-padding iovec. This might end up with an
+ * iov_len==0, in which case we simply drop the iovec.
+ */
+ if (vec) {
+ staging->n_payload -= vec->iov_len;
+ vec->iov_len = prev->vec.size % 8;
+ if (!vec->iov_len) {
+ --staging->n_parts;
+ vec = NULL;
+ } else {
+ staging->n_payload += vec->iov_len;
+ }
+ } else {
+ vec = &staging->parts[staging->n_parts++];
+ vec->iov_len = padding;
+ vec->iov_base = (char __user *)zeros;
+ staging->n_payload += vec->iov_len;
+ }
+ } else {
+ /*
+ * If we merge two 0-vecs with the second 0-vec having
+ * no padding, we know the padding of the first stays
+ * the same. Hence, @vec needs no adjustment.
+ */
+ }
+
+ /* successfully merged with previous item */
+ merge = prev;
+ } else {
+ /*
+ * If we cannot merge the payload item with the previous one,
+ * we simply insert a new iovec for the data/padding.
+ */
+ if (ptr) {
+ vec = &staging->parts[staging->n_parts++];
+ vec->iov_len = merge->vec.size;
+ vec->iov_base = ptr;
+ staging->n_payload += vec->iov_len;
+ } else if (padding) {
+ vec = &staging->parts[staging->n_parts++];
+ vec->iov_len = padding;
+ vec->iov_base = (char __user *)zeros;
+ staging->n_payload += vec->iov_len;
+ } else {
+ vec = NULL;
+ }
+ }
+
+ *prev_item = (struct kdbus_item *)merge;
+ *prev_vec = vec;
+
+ return merge == prev;
+}
+
+static int kdbus_staging_import(struct kdbus_staging *staging)
+{
+ struct kdbus_item *it, *item, *last, *prev_payload;
+ struct kdbus_gaps *gaps = staging->gaps;
+ struct kdbus_msg *msg = staging->msg;
+ struct iovec *part, *prev_part;
+ bool drop_item;
+
+ drop_item = false;
+ last = NULL;
+ prev_payload = NULL;
+ prev_part = NULL;
+
+ /*
+ * We modify msg->items along the way; make sure to use @item as offset
+ * to the next item (instead of the iterator @it).
+ */
+ for (it = item = msg->items;
+ it >= msg->items &&
+ (u8 *)it < (u8 *)msg + msg->size &&
+ (u8 *)it + it->size <= (u8 *)msg + msg->size; ) {
+ /*
+ * If we dropped items along the way, move current item to
+ * front. We must not access @it afterwards, but use @item
+ * instead!
+ */
+ if (it != item)
+ memmove(item, it, it->size);
+ it = (void *)((u8 *)it + KDBUS_ALIGN8(item->size));
+
+ switch (item->type) {
+ case KDBUS_ITEM_PAYLOAD_VEC: {
+ size_t offset = staging->n_payload;
+
+ if (kdbus_staging_merge_vecs(staging, &prev_payload,
+ &prev_part, item)) {
+ drop_item = true;
+ } else if (item->vec.address) {
+ /* real offset is patched later on */
+ item->type = KDBUS_ITEM_PAYLOAD_OFF;
+ item->vec.offset = offset;
+ } else {
+ item->type = KDBUS_ITEM_PAYLOAD_OFF;
+ item->vec.offset = ~0ULL;
+ }
+
+ break;
+ }
+ case KDBUS_ITEM_PAYLOAD_MEMFD: {
+ struct file *f;
+
+ f = kdbus_get_memfd(&item->memfd);
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+
+ gaps->memfd_files[gaps->n_memfds] = f;
+ gaps->memfd_offsets[gaps->n_memfds] =
+ (u8 *)&item->memfd.fd - (u8 *)msg;
+ ++gaps->n_memfds;
+
+ /* memfds cannot be merged */
+ prev_payload = item;
+ prev_part = NULL;
+
+ /* insert padding to make following VECs aligned */
+ if (item->memfd.size % 8) {
+ part = &staging->parts[staging->n_parts++];
+ part->iov_len = item->memfd.size % 8;
+ part->iov_base = (char __user *)zeros;
+ staging->n_payload += part->iov_len;
+ }
+
+ break;
+ }
+ case KDBUS_ITEM_FDS: {
+ size_t i, n_fds;
+
+ n_fds = KDBUS_ITEM_PAYLOAD_SIZE(item) / sizeof(int);
+ for (i = 0; i < n_fds; ++i) {
+ struct file *f;
+
+ f = kdbus_get_fd(item->fds[i]);
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+
+ gaps->fd_files[gaps->n_fds++] = f;
+ }
+
+ gaps->fd_offset = (u8 *)item->fds - (u8 *)msg;
+
+ break;
+ }
+ case KDBUS_ITEM_BLOOM_FILTER:
+ staging->bloom_filter = &item->bloom_filter;
+ break;
+ case KDBUS_ITEM_DST_NAME:
+ staging->dst_name = item->str;
+ break;
+ }
+
+ /* drop item if we merged it with a previous one */
+ if (drop_item) {
+ drop_item = false;
+ } else {
+ last = item;
+ item = KDBUS_ITEM_NEXT(item);
+ }
+ }
+
+ /* adjust message size regarding dropped items */
+ msg->size = offsetof(struct kdbus_msg, items);
+ if (last)
+ msg->size += ((u8 *)last - (u8 *)msg->items) + last->size;
+
+ return 0;
+}
+
+static void kdbus_staging_reserve(struct kdbus_staging *staging)
+{
+ struct iovec *part;
+
+ part = &staging->parts[staging->n_parts++];
+ part->iov_base = (void __user *)zeros;
+ part->iov_len = 0;
+}
+
+static struct kdbus_staging *kdbus_staging_new(struct kdbus_bus *bus,
+ size_t n_parts,
+ size_t msg_extra_size)
+{
+ const size_t reserved_parts = 5; /* see below for explanation */
+ struct kdbus_staging *staging;
+ int ret;
+
+ n_parts += reserved_parts;
+
+ staging = kzalloc(sizeof(*staging) + n_parts * sizeof(*staging->parts) +
+ msg_extra_size, GFP_TEMPORARY);
+ if (!staging)
+ return ERR_PTR(-ENOMEM);
+
+ staging->msg_seqnum = atomic64_inc_return(&bus->domain->last_id);
+ staging->n_parts = 0; /* we reserve n_parts, but don't enforce them */
+ staging->parts = (void *)(staging + 1);
+
+ if (msg_extra_size) /* if requested, allocate message, too */
+ staging->msg = (void *)((u8 *)staging->parts +
+ n_parts * sizeof(*staging->parts));
+
+ staging->meta_proc = kdbus_meta_proc_new();
+ if (IS_ERR(staging->meta_proc)) {
+ ret = PTR_ERR(staging->meta_proc);
+ staging->meta_proc = NULL;
+ goto error;
+ }
+
+ staging->meta_conn = kdbus_meta_conn_new();
+ if (IS_ERR(staging->meta_conn)) {
+ ret = PTR_ERR(staging->meta_conn);
+ staging->meta_conn = NULL;
+ goto error;
+ }
+
+ /*
+ * Prepare iovecs to copy the message into the target pool. We use the
+ * following iovecs:
+ * * iovec to copy "kdbus_msg.size"
+ * * iovec to copy "struct kdbus_msg" (minus size) plus items
+ * * iovec for possible padding after the items
+ * * iovec for metadata items
+ * * iovec for possible padding after the items
+ *
+ * Make sure to update @reserved_parts if you add more parts here.
+ */
+
+ kdbus_staging_reserve(staging); /* msg.size */
+ kdbus_staging_reserve(staging); /* msg (minus msg.size) plus items */
+ kdbus_staging_reserve(staging); /* msg padding */
+ kdbus_staging_reserve(staging); /* meta */
+ kdbus_staging_reserve(staging); /* meta padding */
+
+ return staging;
+
+error:
+ kdbus_staging_free(staging);
+ return ERR_PTR(ret);
+}
+
+struct kdbus_staging *kdbus_staging_new_kernel(struct kdbus_bus *bus,
+ u64 dst, u64 cookie_timeout,
+ size_t it_size, size_t it_type)
+{
+ struct kdbus_staging *staging;
+ size_t size;
+
+ size = offsetof(struct kdbus_msg, items) +
+ KDBUS_ITEM_HEADER_SIZE + it_size;
+
+ staging = kdbus_staging_new(bus, 0, KDBUS_ALIGN8(size));
+ if (IS_ERR(staging))
+ return ERR_CAST(staging);
+
+ staging->msg->size = size;
+ staging->msg->flags = (dst == KDBUS_DST_ID_BROADCAST) ?
+ KDBUS_MSG_SIGNAL : 0;
+ staging->msg->dst_id = dst;
+ staging->msg->src_id = KDBUS_SRC_ID_KERNEL;
+ staging->msg->payload_type = KDBUS_PAYLOAD_KERNEL;
+ staging->msg->cookie_reply = cookie_timeout;
+ staging->notify = staging->msg->items;
+ staging->notify->size = KDBUS_ITEM_HEADER_SIZE + it_size;
+ staging->notify->type = it_type;
+
+ return staging;
+}
+
+struct kdbus_staging *kdbus_staging_new_user(struct kdbus_bus *bus,
+ struct kdbus_cmd_send *cmd,
+ struct kdbus_msg *msg)
+{
+ const size_t reserved_parts = 1; /* see below for explanation */
+ size_t n_memfds, n_fds, n_parts;
+ struct kdbus_staging *staging;
+ int ret;
+
+ /*
+ * Examine user-supplied message and figure out how many resources we
+ * need to allocate in our staging area. This requires us to iterate
+ * the message twice, but saves us from re-allocating our resources
+ * all the time.
+ */
+
+ ret = kdbus_msg_examine(msg, bus, cmd, &n_memfds, &n_fds, &n_parts);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ n_parts += reserved_parts;
+
+ /*
+ * Allocate staging area with the number of required resources. Make
+ * sure that we have enough iovecs for all required parts pre-allocated
+ * so this will hopefully be the only memory allocation for this
+ * message transaction.
+ */
+
+ staging = kdbus_staging_new(bus, n_parts, 0);
+ if (IS_ERR(staging))
+ return ERR_CAST(staging);
+
+ staging->msg = msg;
+
+ /*
+ * If the message contains memfds or fd items, we need to remember some
+ * state so we can fill in the requested information at RECV time.
+ * File-descriptors cannot be passed at SEND time. Hence, allocate a
+ * gaps-object to remember that state. That gaps object is linked to
+ * from the staging area, but will also be linked to from the message
+ * queue of each peer. Hence, each receiver owns a reference to it, and
+ * it will later be used to fill the 'gaps' in message that couldn't be
+	 * it will later be used to fill the 'gaps' in the message that couldn't be
+ * Note that the 'gaps' object is read-only once the staging-allocator
+ * returns. There might be connections receiving a queued message while
+ * the sender still broadcasts the message to other receivers.
+ */
+
+ if (n_memfds > 0 || n_fds > 0) {
+ staging->gaps = kdbus_gaps_new(n_memfds, n_fds);
+ if (IS_ERR(staging->gaps)) {
+ ret = PTR_ERR(staging->gaps);
+ staging->gaps = NULL;
+ kdbus_staging_free(staging);
+ return ERR_PTR(ret);
+ }
+ }
+
+ /*
+ * kdbus_staging_new() already reserves parts for message setup. For
+ * user-supplied messages, we add the following iovecs:
+ * ... variable number of iovecs for payload ...
+ * * final iovec for possible padding of payload
+ *
+ * Make sure to update @reserved_parts if you add more parts here.
+ */
+
+ ret = kdbus_staging_import(staging); /* payload */
+ kdbus_staging_reserve(staging); /* payload padding */
+
+ if (ret < 0)
+ goto error;
+
+ return staging;
+
+error:
+ kdbus_staging_free(staging);
+ return ERR_PTR(ret);
+}
+
+struct kdbus_staging *kdbus_staging_free(struct kdbus_staging *staging)
+{
+ if (!staging)
+ return NULL;
+
+ kdbus_meta_conn_unref(staging->meta_conn);
+ kdbus_meta_proc_unref(staging->meta_proc);
+ kdbus_gaps_unref(staging->gaps);
+ kfree(staging);
+
+ return NULL;
+}
+
+static int kdbus_staging_collect_metadata(struct kdbus_staging *staging,
+ struct kdbus_conn *src,
+ struct kdbus_conn *dst,
+ u64 *out_attach)
+{
+ u64 attach;
+ int ret;
+
+ if (src)
+ attach = kdbus_meta_msg_mask(src, dst);
+ else
+ attach = KDBUS_ATTACH_TIMESTAMP; /* metadata for kernel msgs */
+
+ if (src && !src->meta_fake) {
+ ret = kdbus_meta_proc_collect(staging->meta_proc, attach);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = kdbus_meta_conn_collect(staging->meta_conn, src,
+ staging->msg_seqnum, attach);
+ if (ret < 0)
+ return ret;
+
+ *out_attach = attach;
+ return 0;
+}
+
+/**
+ * kdbus_staging_emit() - emit linearized message in target pool
+ * @staging: staging object to create message from
+ * @src: sender of the message (or NULL)
+ * @dst: target connection to allocate message for
+ *
+ * This allocates a pool-slice for @dst and copies the message provided by
+ * @staging into it. The new slice is then returned to the caller for further
+ * processing. It's not linked into any queue, yet.
+ *
+ * Return: Newly allocated slice or ERR_PTR on failure.
+ */
+struct kdbus_pool_slice *kdbus_staging_emit(struct kdbus_staging *staging,
+ struct kdbus_conn *src,
+ struct kdbus_conn *dst)
+{
+ struct kdbus_item *item, *meta_items = NULL;
+ struct kdbus_pool_slice *slice = NULL;
+ size_t off, size, meta_size;
+ struct iovec *v;
+ u64 attach, msg_size;
+ int ret;
+
+ /*
+ * Step 1:
+ * Collect metadata from @src depending on the attach-flags allowed for
+ * @dst. Translate it into the namespaces pinned by @dst.
+ */
+
+ ret = kdbus_staging_collect_metadata(staging, src, dst, &attach);
+ if (ret < 0)
+ goto error;
+
+ ret = kdbus_meta_emit(staging->meta_proc, NULL, staging->meta_conn,
+ dst, attach, &meta_items, &meta_size);
+ if (ret < 0)
+ goto error;
+
+ /*
+ * Step 2:
+ * Setup iovecs for the message. See kdbus_staging_new() for allocation
+ * of those iovecs. All reserved iovecs have been initialized with
+ * iov_len=0 + iov_base=zeros. Furthermore, the iovecs to copy the
+ * actual message payload have already been initialized and need not be
+ * touched.
+ */
+
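+ /*
+ * For reference, the iovecs assembled below produce a slice with this
+ * layout (each section padded to 8-byte alignment, the metadata section
+ * possibly empty):
+ *
+ *     msg.size | msg + items | pad | meta items | pad | payload | pad
+ *
+ * @i_payload records the offset of the payload section so PAYLOAD_OFF
+ * items can be patched in step 3.
+ */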
+ v = staging->parts;
+ msg_size = staging->msg->size;
+
+ /* msg.size */
+ v->iov_len = sizeof(msg_size);
+ v->iov_base = (void __user *)&msg_size;
+ ++v;
+
+ /* msg (after msg.size) plus items */
+ v->iov_len = staging->msg->size - sizeof(staging->msg->size);
+ v->iov_base = (void __user *)((u8 *)staging->msg +
+ sizeof(staging->msg->size));
+ ++v;
+
+ /* padding after msg */
+ v->iov_len = KDBUS_ALIGN8(staging->msg->size) - staging->msg->size;
+ v->iov_base = (void __user *)zeros;
+ ++v;
+
+ if (meta_size > 0) {
+ /* metadata items */
+ v->iov_len = meta_size;
+ v->iov_base = (void __user *)meta_items;
+ ++v;
+
+ /* padding after metadata */
+ v->iov_len = KDBUS_ALIGN8(meta_size) - meta_size;
+ v->iov_base = (void __user *)zeros;
+ ++v;
+
+ msg_size = KDBUS_ALIGN8(msg_size) + meta_size;
+ } else {
+ /* metadata items */
+ v->iov_len = 0;
+ v->iov_base = (void __user *)zeros;
+ ++v;
+
+ /* padding after metadata */
+ v->iov_len = 0;
+ v->iov_base = (void __user *)zeros;
+ ++v;
+ }
+
+ /* ... payload iovecs are already filled in ... */
+
+ /* compute overall size and fill in padding after payload */
+ size = KDBUS_ALIGN8(msg_size);
+
+ if (staging->n_payload > 0) {
+ size += staging->n_payload;
+
+ v = &staging->parts[staging->n_parts - 1];
+ v->iov_len = KDBUS_ALIGN8(size) - size;
+ v->iov_base = (void __user *)zeros;
+
+ size = KDBUS_ALIGN8(size);
+ }
+
+ /*
+ * Step 3:
+ * The PAYLOAD_OFF items in the message contain a relative 'offset'
+ * field that tells the receiver where to find the actual payload. This
+ * offset is relative to the start of the message, and as such depends
+ * on the size of the metadata items we inserted. This size is variable
+ * and changes for each peer we send the message to. Hence, we remember
+ * the last relative offset that was used to calculate the 'offset'
+ * fields. For each message, we re-calculate it and patch all items, in
+ * case it changed.
+ */
+
+ off = KDBUS_ALIGN8(msg_size);
+
+ if (off != staging->i_payload) {
+ KDBUS_ITEMS_FOREACH(item, staging->msg->items,
+ KDBUS_ITEMS_SIZE(staging->msg, items)) {
+ if (item->type != KDBUS_ITEM_PAYLOAD_OFF)
+ continue;
+
+ item->vec.offset -= staging->i_payload;
+ item->vec.offset += off;
+ }
+
+ staging->i_payload = off;
+ }
+
+ /*
+ * Step 4:
+ * Allocate pool slice and copy over all data. Make sure to properly
+ * account on user quota.
+ */
+
+ ret = kdbus_conn_quota_inc(dst, src ? src->user : NULL, size,
+ staging->gaps ? staging->gaps->n_fds : 0);
+ if (ret < 0)
+ goto error;
+
+ slice = kdbus_pool_slice_alloc(dst->pool, size, true);
+ if (IS_ERR(slice)) {
+ ret = PTR_ERR(slice);
+ slice = NULL;
+ goto error;
+ }
+
+ WARN_ON(kdbus_pool_slice_size(slice) != size);
+
+ ret = kdbus_pool_slice_copy_iovec(slice, 0, staging->parts,
+ staging->n_parts, size);
+ if (ret < 0)
+ goto error;
+
+ /* all done, return slice to caller */
+ goto exit;
+
+error:
+ if (slice)
+ kdbus_conn_quota_dec(dst, src ? src->user : NULL, size,
+ staging->gaps ? staging->gaps->n_fds : 0);
+ kdbus_pool_slice_release(slice);
+ slice = ERR_PTR(ret);
+exit:
+ kfree(meta_items);
+ return slice;
+}
diff --git a/ipc/kdbus/message.h b/ipc/kdbus/message.h
new file mode 100644
index 000000000..298f9c99d
--- /dev/null
+++ b/ipc/kdbus/message.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_MESSAGE_H
+#define __KDBUS_MESSAGE_H
+
+#include <linux/fs.h>
+#include <linux/kref.h>
+#include <uapi/linux/kdbus.h>
+
+struct kdbus_bus;
+struct kdbus_conn;
+struct kdbus_meta_conn;
+struct kdbus_meta_proc;
+struct kdbus_pool_slice;
+
+/**
+ * struct kdbus_gaps - gaps in message to be filled later
+ * @kref: Reference counter
+ * @n_memfds: Number of memfds
+ * @memfd_offsets: Offsets of kdbus_memfd items in target slice
+ * @memfd_files: Array of files backing the sent memfds
+ * @n_fds: Number of fds
+ * @fd_files: Array of sent fds
+ * @fd_offset: Offset of the fd-array item in the target slice
+ *
+ * The 'gaps' object is used to track data that is needed to fill gaps in a
+ * message at RECV time. Usually, we try to compile the whole message at SEND
+ * time. This has the advantage that we don't have to cache any information and
+ * can keep the memory consumption small. Furthermore, all copy operations can
+ * be combined into a single function call, which speeds up transactions
+ * considerably.
+ * However, things like file-descriptors can only be fully installed at RECV
+ * time. The gaps object tracks this data and pins it until a message is
+ * received. The gaps object is shared between all receivers of the same
+ * message.
+ */
+struct kdbus_gaps {
+ struct kref kref;
+
+ /* state tracking for KDBUS_ITEM_PAYLOAD_MEMFD entries */
+ size_t n_memfds;
+ u64 *memfd_offsets;
+ struct file **memfd_files;
+
+ /* state tracking for KDBUS_ITEM_FDS */
+ size_t n_fds;
+ struct file **fd_files;
+ u64 fd_offset;
+};
+
+struct kdbus_gaps *kdbus_gaps_ref(struct kdbus_gaps *gaps);
+struct kdbus_gaps *kdbus_gaps_unref(struct kdbus_gaps *gaps);
+int kdbus_gaps_install(struct kdbus_gaps *gaps, struct kdbus_pool_slice *slice,
+ bool *out_incomplete);
+
+/**
+ * struct kdbus_staging - staging area to import messages
+ * @msg: User-supplied message
+ * @gaps: Gaps-object created during import (or NULL if empty)
+ * @msg_seqnum: Message sequence number
+ * @notify_entry: Entry into list of kernel-generated notifications
+ * @i_payload: Current relative index of start of payload
+ * @n_payload: Total number of bytes needed for payload
+ * @n_parts: Number of parts
+ * @parts: Array of iovecs that make up the whole message
+ * @meta_proc: Process metadata of the sender (or NULL if empty)
+ * @meta_conn: Connection metadata of the sender (or NULL if empty)
+ * @bloom_filter: Pointer to the bloom-item in @msg, or NULL
+ * @dst_name: Pointer to the dst-name-item in @msg, or NULL
+ * @notify: Pointer to the notification item in @msg, or NULL
+ *
+ * The kdbus_staging object is a temporary staging area to import user-supplied
+ * messages into the kernel. It is only used during SEND and dropped once the
+ * message is queued. Any data that cannot be collected during SEND, is
+ * collected in a kdbus_gaps object and attached to the message queue.
+ */
+struct kdbus_staging {
+ struct kdbus_msg *msg;
+ struct kdbus_gaps *gaps;
+ u64 msg_seqnum;
+ struct list_head notify_entry;
+
+ /* crafted iovecs to copy the message */
+ size_t i_payload;
+ size_t n_payload;
+ size_t n_parts;
+ struct iovec *parts;
+
+ /* metadata state */
+ struct kdbus_meta_proc *meta_proc;
+ struct kdbus_meta_conn *meta_conn;
+
+ /* cached pointers into @msg */
+ const struct kdbus_bloom_filter *bloom_filter;
+ const char *dst_name;
+ struct kdbus_item *notify;
+};
+
+struct kdbus_staging *kdbus_staging_new_kernel(struct kdbus_bus *bus,
+ u64 dst, u64 cookie_timeout,
+ size_t it_size, size_t it_type);
+struct kdbus_staging *kdbus_staging_new_user(struct kdbus_bus *bus,
+ struct kdbus_cmd_send *cmd,
+ struct kdbus_msg *msg);
+struct kdbus_staging *kdbus_staging_free(struct kdbus_staging *staging);
+struct kdbus_pool_slice *kdbus_staging_emit(struct kdbus_staging *staging,
+ struct kdbus_conn *src,
+ struct kdbus_conn *dst);
+
+#endif
diff --git a/ipc/kdbus/metadata.c b/ipc/kdbus/metadata.c
new file mode 100644
index 000000000..d4973a90a
--- /dev/null
+++ b/ipc/kdbus/metadata.c
@@ -0,0 +1,1342 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/cgroup.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/fs_struct.h>
+#include <linux/init.h>
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/security.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uidgid.h>
+#include <linux/uio.h>
+#include <linux/user_namespace.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "item.h"
+#include "message.h"
+#include "metadata.h"
+#include "names.h"
+
+/**
+ * struct kdbus_meta_proc - Process metadata
+ * @kref: Reference counting
+ * @lock: Object lock
+ * @collected: Bitmask of collected items
+ * @valid: Bitmask of collected and valid items
+ * @cred: Credentials
+ * @pid: PID of process
+ * @tgid: TGID of process
+ * @ppid: PPID of process
+ * @tid_comm: TID comm line
+ * @pid_comm: PID comm line
+ * @exe_path: Executable path
+ * @root_path: Root-FS path
+ * @cmdline: Command-line
+ * @cgroup: Full cgroup path
+ * @seclabel: Seclabel
+ * @audit_loginuid: Audit login-UID
+ * @audit_sessionid: Audit session-ID
+ */
+struct kdbus_meta_proc {
+ struct kref kref;
+ struct mutex lock;
+ u64 collected;
+ u64 valid;
+
+ /* KDBUS_ITEM_CREDS */
+ /* KDBUS_ITEM_AUXGROUPS */
+ /* KDBUS_ITEM_CAPS */
+ const struct cred *cred;
+
+ /* KDBUS_ITEM_PIDS */
+ struct pid *pid;
+ struct pid *tgid;
+ struct pid *ppid;
+
+ /* KDBUS_ITEM_TID_COMM */
+ char tid_comm[TASK_COMM_LEN];
+ /* KDBUS_ITEM_PID_COMM */
+ char pid_comm[TASK_COMM_LEN];
+
+ /* KDBUS_ITEM_EXE */
+ struct path exe_path;
+ struct path root_path;
+
+ /* KDBUS_ITEM_CMDLINE */
+ char *cmdline;
+
+ /* KDBUS_ITEM_CGROUP */
+ char *cgroup;
+
+ /* KDBUS_ITEM_SECLABEL */
+ char *seclabel;
+
+ /* KDBUS_ITEM_AUDIT */
+ kuid_t audit_loginuid;
+ unsigned int audit_sessionid;
+};
+
+/**
+ * struct kdbus_meta_conn - Connection metadata
+ * @kref: Reference counting
+ * @lock: Object lock
+ * @collected: Bitmask of collected items
+ * @valid: Bitmask of collected and valid items
+ * @ts: Timestamp values
+ * @owned_names_items: Serialized items for owned names
+ * @owned_names_size: Size of @owned_names_items
+ * @conn_description: Connection description
+ */
+struct kdbus_meta_conn {
+ struct kref kref;
+ struct mutex lock;
+ u64 collected;
+ u64 valid;
+
+ /* KDBUS_ITEM_TIMESTAMP */
+ struct kdbus_timestamp ts;
+
+ /* KDBUS_ITEM_OWNED_NAME */
+ struct kdbus_item *owned_names_items;
+ size_t owned_names_size;
+
+ /* KDBUS_ITEM_CONN_DESCRIPTION */
+ char *conn_description;
+};
+
+/* fixed size equivalent of "kdbus_caps" */
+struct kdbus_meta_caps {
+ u32 last_cap;
+ struct {
+ u32 caps[_KERNEL_CAPABILITY_U32S];
+ } set[4];
+};
+
+/**
+ * kdbus_meta_proc_new() - Create process metadata object
+ *
+ * Return: Pointer to new object on success, ERR_PTR on failure.
+ */
+struct kdbus_meta_proc *kdbus_meta_proc_new(void)
+{
+ struct kdbus_meta_proc *mp;
+
+ mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+ if (!mp)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&mp->kref);
+ mutex_init(&mp->lock);
+
+ return mp;
+}
+
+static void kdbus_meta_proc_free(struct kref *kref)
+{
+ struct kdbus_meta_proc *mp = container_of(kref, struct kdbus_meta_proc,
+ kref);
+
+ path_put(&mp->exe_path);
+ path_put(&mp->root_path);
+ if (mp->cred)
+ put_cred(mp->cred);
+ put_pid(mp->ppid);
+ put_pid(mp->tgid);
+ put_pid(mp->pid);
+
+ kfree(mp->seclabel);
+ kfree(mp->cmdline);
+ kfree(mp->cgroup);
+ kfree(mp);
+}
+
+/**
+ * kdbus_meta_proc_ref() - Gain reference
+ * @mp: Process metadata object
+ *
+ * Return: @mp is returned
+ */
+struct kdbus_meta_proc *kdbus_meta_proc_ref(struct kdbus_meta_proc *mp)
+{
+ if (mp)
+ kref_get(&mp->kref);
+ return mp;
+}
+
+/**
+ * kdbus_meta_proc_unref() - Drop reference
+ * @mp: Process metadata object
+ *
+ * Return: NULL
+ */
+struct kdbus_meta_proc *kdbus_meta_proc_unref(struct kdbus_meta_proc *mp)
+{
+ if (mp)
+ kref_put(&mp->kref, kdbus_meta_proc_free);
+ return NULL;
+}
+
+static void kdbus_meta_proc_collect_pids(struct kdbus_meta_proc *mp)
+{
+ struct task_struct *parent;
+
+ mp->pid = get_pid(task_pid(current));
+ mp->tgid = get_pid(task_tgid(current));
+
+ rcu_read_lock();
+ parent = rcu_dereference(current->real_parent);
+ mp->ppid = get_pid(task_tgid(parent));
+ rcu_read_unlock();
+
+ mp->valid |= KDBUS_ATTACH_PIDS;
+}
+
+static void kdbus_meta_proc_collect_tid_comm(struct kdbus_meta_proc *mp)
+{
+ get_task_comm(mp->tid_comm, current);
+ mp->valid |= KDBUS_ATTACH_TID_COMM;
+}
+
+static void kdbus_meta_proc_collect_pid_comm(struct kdbus_meta_proc *mp)
+{
+ get_task_comm(mp->pid_comm, current->group_leader);
+ mp->valid |= KDBUS_ATTACH_PID_COMM;
+}
+
+static void kdbus_meta_proc_collect_exe(struct kdbus_meta_proc *mp)
+{
+ struct file *exe_file;
+
+ rcu_read_lock();
+ exe_file = rcu_dereference(current->mm->exe_file);
+ if (exe_file) {
+ mp->exe_path = exe_file->f_path;
+ path_get(&mp->exe_path);
+ get_fs_root(current->fs, &mp->root_path);
+ mp->valid |= KDBUS_ATTACH_EXE;
+ }
+ rcu_read_unlock();
+}
+
+static int kdbus_meta_proc_collect_cmdline(struct kdbus_meta_proc *mp)
+{
+ struct mm_struct *mm = current->mm;
+ char *cmdline;
+
+ if (!mm->arg_end)
+ return 0;
+
+ cmdline = strndup_user((const char __user *)mm->arg_start,
+ mm->arg_end - mm->arg_start);
+ if (IS_ERR(cmdline))
+ return PTR_ERR(cmdline);
+
+ mp->cmdline = cmdline;
+ mp->valid |= KDBUS_ATTACH_CMDLINE;
+
+ return 0;
+}
+
+static int kdbus_meta_proc_collect_cgroup(struct kdbus_meta_proc *mp)
+{
+#ifdef CONFIG_CGROUPS
+ void *page;
+ char *s;
+
+ page = (void *)__get_free_page(GFP_TEMPORARY);
+ if (!page)
+ return -ENOMEM;
+
+ s = task_cgroup_path(current, page, PAGE_SIZE);
+ if (s) {
+ mp->cgroup = kstrdup(s, GFP_KERNEL);
+ if (!mp->cgroup) {
+ free_page((unsigned long)page);
+ return -ENOMEM;
+ }
+ }
+
+ free_page((unsigned long)page);
+ mp->valid |= KDBUS_ATTACH_CGROUP;
+#endif
+
+ return 0;
+}
+
+static int kdbus_meta_proc_collect_seclabel(struct kdbus_meta_proc *mp)
+{
+#ifdef CONFIG_SECURITY
+ char *ctx = NULL;
+ u32 sid, len;
+ int ret;
+
+ security_task_getsecid(current, &sid);
+ ret = security_secid_to_secctx(sid, &ctx, &len);
+ if (ret < 0) {
+ /*
+ * EOPNOTSUPP means no security module is active,
+ * let's skip adding the seclabel then. This effectively
+ * drops the SECLABEL item.
+ */
+ return (ret == -EOPNOTSUPP) ? 0 : ret;
+ }
+
+ mp->seclabel = kstrdup(ctx, GFP_KERNEL);
+ security_release_secctx(ctx, len);
+ if (!mp->seclabel)
+ return -ENOMEM;
+
+ mp->valid |= KDBUS_ATTACH_SECLABEL;
+#endif
+
+ return 0;
+}
+
+static void kdbus_meta_proc_collect_audit(struct kdbus_meta_proc *mp)
+{
+#ifdef CONFIG_AUDITSYSCALL
+ mp->audit_loginuid = audit_get_loginuid(current);
+ mp->audit_sessionid = audit_get_sessionid(current);
+ mp->valid |= KDBUS_ATTACH_AUDIT;
+#endif
+}
+
+/**
+ * kdbus_meta_proc_collect() - Collect process metadata
+ * @mp: Process metadata object
+ * @what: Attach flags to collect
+ *
+ * This collects process metadata from current and saves it in @mp.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_meta_proc_collect(struct kdbus_meta_proc *mp, u64 what)
+{
+ int ret;
+
+ if (!mp || !(what & (KDBUS_ATTACH_CREDS |
+ KDBUS_ATTACH_PIDS |
+ KDBUS_ATTACH_AUXGROUPS |
+ KDBUS_ATTACH_TID_COMM |
+ KDBUS_ATTACH_PID_COMM |
+ KDBUS_ATTACH_EXE |
+ KDBUS_ATTACH_CMDLINE |
+ KDBUS_ATTACH_CGROUP |
+ KDBUS_ATTACH_CAPS |
+ KDBUS_ATTACH_SECLABEL |
+ KDBUS_ATTACH_AUDIT)))
+ return 0;
+
+ mutex_lock(&mp->lock);
+
+ /* creds, auxgrps and caps share "struct cred" as context */
+ {
+ const u64 m_cred = KDBUS_ATTACH_CREDS |
+ KDBUS_ATTACH_AUXGROUPS |
+ KDBUS_ATTACH_CAPS;
+
+ if ((what & m_cred) && !(mp->collected & m_cred)) {
+ mp->cred = get_current_cred();
+ mp->valid |= m_cred;
+ mp->collected |= m_cred;
+ }
+ }
+
+ if ((what & KDBUS_ATTACH_PIDS) &&
+ !(mp->collected & KDBUS_ATTACH_PIDS)) {
+ kdbus_meta_proc_collect_pids(mp);
+ mp->collected |= KDBUS_ATTACH_PIDS;
+ }
+
+ if ((what & KDBUS_ATTACH_TID_COMM) &&
+ !(mp->collected & KDBUS_ATTACH_TID_COMM)) {
+ kdbus_meta_proc_collect_tid_comm(mp);
+ mp->collected |= KDBUS_ATTACH_TID_COMM;
+ }
+
+ if ((what & KDBUS_ATTACH_PID_COMM) &&
+ !(mp->collected & KDBUS_ATTACH_PID_COMM)) {
+ kdbus_meta_proc_collect_pid_comm(mp);
+ mp->collected |= KDBUS_ATTACH_PID_COMM;
+ }
+
+ if ((what & KDBUS_ATTACH_EXE) &&
+ !(mp->collected & KDBUS_ATTACH_EXE)) {
+ kdbus_meta_proc_collect_exe(mp);
+ mp->collected |= KDBUS_ATTACH_EXE;
+ }
+
+ if ((what & KDBUS_ATTACH_CMDLINE) &&
+ !(mp->collected & KDBUS_ATTACH_CMDLINE)) {
+ ret = kdbus_meta_proc_collect_cmdline(mp);
+ if (ret < 0)
+ goto exit_unlock;
+ mp->collected |= KDBUS_ATTACH_CMDLINE;
+ }
+
+ if ((what & KDBUS_ATTACH_CGROUP) &&
+ !(mp->collected & KDBUS_ATTACH_CGROUP)) {
+ ret = kdbus_meta_proc_collect_cgroup(mp);
+ if (ret < 0)
+ goto exit_unlock;
+ mp->collected |= KDBUS_ATTACH_CGROUP;
+ }
+
+ if ((what & KDBUS_ATTACH_SECLABEL) &&
+ !(mp->collected & KDBUS_ATTACH_SECLABEL)) {
+ ret = kdbus_meta_proc_collect_seclabel(mp);
+ if (ret < 0)
+ goto exit_unlock;
+ mp->collected |= KDBUS_ATTACH_SECLABEL;
+ }
+
+ if ((what & KDBUS_ATTACH_AUDIT) &&
+ !(mp->collected & KDBUS_ATTACH_AUDIT)) {
+ kdbus_meta_proc_collect_audit(mp);
+ mp->collected |= KDBUS_ATTACH_AUDIT;
+ }
+
+ ret = 0;
+
+exit_unlock:
+ mutex_unlock(&mp->lock);
+ return ret;
+}
+
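+/*
+ * Illustrative sketch only (an assumption about typical usage; flags and
+ * error handling are placeholders): a sender-side collection pass roughly
+ * looks like
+ *
+ *     struct kdbus_meta_proc *mp;
+ *     int ret;
+ *
+ *     mp = kdbus_meta_proc_new();
+ *     if (IS_ERR(mp))
+ *             return PTR_ERR(mp);
+ *
+ *     ret = kdbus_meta_proc_collect(mp, KDBUS_ATTACH_CREDS |
+ *                                       KDBUS_ATTACH_PIDS);
+ *     if (ret < 0)
+ *             mp = kdbus_meta_proc_unref(mp);
+ */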
+/**
+ * kdbus_meta_fake_new() - Create fake metadata object
+ *
+ * Return: Pointer to new object on success, ERR_PTR on failure.
+ */
+struct kdbus_meta_fake *kdbus_meta_fake_new(void)
+{
+ struct kdbus_meta_fake *mf;
+
+ mf = kzalloc(sizeof(*mf), GFP_KERNEL);
+ if (!mf)
+ return ERR_PTR(-ENOMEM);
+
+ return mf;
+}
+
+/**
+ * kdbus_meta_fake_free() - Free fake metadata object
+ * @mf: Fake metadata object
+ *
+ * Return: NULL
+ */
+struct kdbus_meta_fake *kdbus_meta_fake_free(struct kdbus_meta_fake *mf)
+{
+ if (mf) {
+ put_pid(mf->ppid);
+ put_pid(mf->tgid);
+ put_pid(mf->pid);
+ kfree(mf->seclabel);
+ kfree(mf);
+ }
+
+ return NULL;
+}
+
+/**
+ * kdbus_meta_fake_collect() - Fill fake metadata from faked credentials
+ * @mf: Fake metadata object
+ * @creds: Creds to set, may be %NULL
+ * @pids: PIDs to set, may be %NULL
+ * @seclabel: Seclabel to set, may be %NULL
+ *
+ * This function takes information stored in @creds, @pids and @seclabel and
+ * resolves them to kernel-representations, if possible. This call uses the
+ * current task's namespaces to resolve the given information.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_meta_fake_collect(struct kdbus_meta_fake *mf,
+ const struct kdbus_creds *creds,
+ const struct kdbus_pids *pids,
+ const char *seclabel)
+{
+ if (mf->valid)
+ return -EALREADY;
+
+ if (creds) {
+ struct user_namespace *ns = current_user_ns();
+
+ mf->uid = make_kuid(ns, creds->uid);
+ mf->euid = make_kuid(ns, creds->euid);
+ mf->suid = make_kuid(ns, creds->suid);
+ mf->fsuid = make_kuid(ns, creds->fsuid);
+
+ mf->gid = make_kgid(ns, creds->gid);
+ mf->egid = make_kgid(ns, creds->egid);
+ mf->sgid = make_kgid(ns, creds->sgid);
+ mf->fsgid = make_kgid(ns, creds->fsgid);
+
+ if ((creds->uid != (uid_t)-1 && !uid_valid(mf->uid)) ||
+ (creds->euid != (uid_t)-1 && !uid_valid(mf->euid)) ||
+ (creds->suid != (uid_t)-1 && !uid_valid(mf->suid)) ||
+ (creds->fsuid != (uid_t)-1 && !uid_valid(mf->fsuid)) ||
+ (creds->gid != (gid_t)-1 && !gid_valid(mf->gid)) ||
+ (creds->egid != (gid_t)-1 && !gid_valid(mf->egid)) ||
+ (creds->sgid != (gid_t)-1 && !gid_valid(mf->sgid)) ||
+ (creds->fsgid != (gid_t)-1 && !gid_valid(mf->fsgid)))
+ return -EINVAL;
+
+ mf->valid |= KDBUS_ATTACH_CREDS;
+ }
+
+ if (pids) {
+ mf->pid = get_pid(find_vpid(pids->tid));
+ mf->tgid = get_pid(find_vpid(pids->pid));
+ mf->ppid = get_pid(find_vpid(pids->ppid));
+
+ if ((pids->tid != 0 && !mf->pid) ||
+ (pids->pid != 0 && !mf->tgid) ||
+ (pids->ppid != 0 && !mf->ppid)) {
+ put_pid(mf->pid);
+ put_pid(mf->tgid);
+ put_pid(mf->ppid);
+ mf->pid = NULL;
+ mf->tgid = NULL;
+ mf->ppid = NULL;
+ return -EINVAL;
+ }
+
+ mf->valid |= KDBUS_ATTACH_PIDS;
+ }
+
+ if (seclabel) {
+ mf->seclabel = kstrdup(seclabel, GFP_KERNEL);
+ if (!mf->seclabel)
+ return -ENOMEM;
+
+ mf->valid |= KDBUS_ATTACH_SECLABEL;
+ }
+
+ return 0;
+}
+
+/**
+ * kdbus_meta_conn_new() - Create connection metadata object
+ *
+ * Return: Pointer to new object on success, ERR_PTR on failure.
+ */
+struct kdbus_meta_conn *kdbus_meta_conn_new(void)
+{
+ struct kdbus_meta_conn *mc;
+
+ mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+ if (!mc)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&mc->kref);
+ mutex_init(&mc->lock);
+
+ return mc;
+}
+
+static void kdbus_meta_conn_free(struct kref *kref)
+{
+ struct kdbus_meta_conn *mc =
+ container_of(kref, struct kdbus_meta_conn, kref);
+
+ kfree(mc->conn_description);
+ kfree(mc->owned_names_items);
+ kfree(mc);
+}
+
+/**
+ * kdbus_meta_conn_ref() - Gain reference
+ * @mc: Connection metadata object
+ *
+ * Return: @mc is returned
+ */
+struct kdbus_meta_conn *kdbus_meta_conn_ref(struct kdbus_meta_conn *mc)
+{
+ if (mc)
+ kref_get(&mc->kref);
+ return mc;
+}
+
+/**
+ * kdbus_meta_conn_unref() - Drop reference
+ * @mc: Connection metadata object
+ *
+ * Return: NULL
+ */
+struct kdbus_meta_conn *kdbus_meta_conn_unref(struct kdbus_meta_conn *mc)
+{
+ if (mc)
+ kref_put(&mc->kref, kdbus_meta_conn_free);
+ return NULL;
+}
+
+static void kdbus_meta_conn_collect_timestamp(struct kdbus_meta_conn *mc,
+ u64 msg_seqnum)
+{
+ mc->ts.monotonic_ns = ktime_get_ns();
+ mc->ts.realtime_ns = ktime_get_real_ns();
+
+ if (msg_seqnum)
+ mc->ts.seqnum = msg_seqnum;
+
+ mc->valid |= KDBUS_ATTACH_TIMESTAMP;
+}
+
+static int kdbus_meta_conn_collect_names(struct kdbus_meta_conn *mc,
+ struct kdbus_conn *conn)
+{
+ const struct kdbus_name_entry *e;
+ struct kdbus_item *item;
+ size_t slen, size;
+
+ lockdep_assert_held(&conn->ep->bus->name_registry->rwlock);
+
+ size = 0;
+ /* open-code length calculation to avoid final padding */
+ list_for_each_entry(e, &conn->names_list, conn_entry)
+ size = KDBUS_ALIGN8(size) + KDBUS_ITEM_HEADER_SIZE +
+ sizeof(struct kdbus_name) + strlen(e->name) + 1;
+
+ if (!size)
+ return 0;
+
+ /* make sure we include zeroed padding for convenience helpers */
+ item = kmalloc(KDBUS_ALIGN8(size), GFP_KERNEL);
+ if (!item)
+ return -ENOMEM;
+
+ mc->owned_names_items = item;
+ mc->owned_names_size = size;
+
+ list_for_each_entry(e, &conn->names_list, conn_entry) {
+ slen = strlen(e->name) + 1;
+ kdbus_item_set(item, KDBUS_ITEM_OWNED_NAME, NULL,
+ sizeof(struct kdbus_name) + slen);
+ item->name.flags = e->flags;
+ memcpy(item->name.name, e->name, slen);
+ item = KDBUS_ITEM_NEXT(item);
+ }
+
+ /* sanity check: the buffer should be completely written now */
+ WARN_ON((u8 *)item !=
+ (u8 *)mc->owned_names_items + KDBUS_ALIGN8(size));
+
+ mc->valid |= KDBUS_ATTACH_NAMES;
+ return 0;
+}
+
+static int kdbus_meta_conn_collect_description(struct kdbus_meta_conn *mc,
+ struct kdbus_conn *conn)
+{
+ if (!conn->description)
+ return 0;
+
+ mc->conn_description = kstrdup(conn->description, GFP_KERNEL);
+ if (!mc->conn_description)
+ return -ENOMEM;
+
+ mc->valid |= KDBUS_ATTACH_CONN_DESCRIPTION;
+ return 0;
+}
+
+/**
+ * kdbus_meta_conn_collect() - Collect connection metadata
+ * @mc: Message metadata object
+ * @conn: Connection to collect data from
+ * @msg_seqnum: Sequence number of the message to send
+ * @what: Attach flags to collect
+ *
+ * This collects connection metadata from @msg_seqnum and @conn and saves it
+ * in @mc.
+ *
+ * If KDBUS_ATTACH_NAMES is set in @what and @conn is non-NULL, the caller must
+ * hold the name-registry read-lock of conn->ep->bus->name_registry.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_meta_conn_collect(struct kdbus_meta_conn *mc,
+ struct kdbus_conn *conn,
+ u64 msg_seqnum, u64 what)
+{
+ int ret;
+
+ if (!mc || !(what & (KDBUS_ATTACH_TIMESTAMP |
+ KDBUS_ATTACH_NAMES |
+ KDBUS_ATTACH_CONN_DESCRIPTION)))
+ return 0;
+
+ mutex_lock(&mc->lock);
+
+ if (msg_seqnum && (what & KDBUS_ATTACH_TIMESTAMP) &&
+ !(mc->collected & KDBUS_ATTACH_TIMESTAMP)) {
+ kdbus_meta_conn_collect_timestamp(mc, msg_seqnum);
+ mc->collected |= KDBUS_ATTACH_TIMESTAMP;
+ }
+
+ if (conn && (what & KDBUS_ATTACH_NAMES) &&
+ !(mc->collected & KDBUS_ATTACH_NAMES)) {
+ ret = kdbus_meta_conn_collect_names(mc, conn);
+ if (ret < 0)
+ goto exit_unlock;
+ mc->collected |= KDBUS_ATTACH_NAMES;
+ }
+
+ if (conn && (what & KDBUS_ATTACH_CONN_DESCRIPTION) &&
+ !(mc->collected & KDBUS_ATTACH_CONN_DESCRIPTION)) {
+ ret = kdbus_meta_conn_collect_description(mc, conn);
+ if (ret < 0)
+ goto exit_unlock;
+ mc->collected |= KDBUS_ATTACH_CONN_DESCRIPTION;
+ }
+
+ ret = 0;
+
+exit_unlock:
+ mutex_unlock(&mc->lock);
+ return ret;
+}
+
+static void kdbus_meta_export_caps(struct kdbus_meta_caps *out,
+ const struct kdbus_meta_proc *mp,
+ struct user_namespace *user_ns)
+{
+ struct user_namespace *iter;
+ const struct cred *cred = mp->cred;
+ bool parent = false, owner = false;
+ int i;
+
+ /*
+ * This translates the effective capabilities of 'cred' into the given
+ * user-namespace. If the given user-namespace is a child-namespace of
+ * the user-namespace of 'cred', the mask can be copied verbatim. If
+ * not, the mask is cleared.
+ * There's one exception: If 'cred' is the owner of any user-namespace
+ * in the path between the given user-namespace and the user-namespace
+ * of 'cred', then it has all effective capabilities set. This means,
+ * the user who created a user-namespace always has all effective
+ * capabilities in any child namespaces. Note that this is based on the
+ * uid of the namespace creator, not the task hierarchy.
+ */
+ for (iter = user_ns; iter; iter = iter->parent) {
+ if (iter == cred->user_ns) {
+ parent = true;
+ break;
+ }
+
+ if (iter == &init_user_ns)
+ break;
+
+ if ((iter->parent == cred->user_ns) &&
+ uid_eq(iter->owner, cred->euid)) {
+ owner = true;
+ break;
+ }
+ }
+
+ out->last_cap = CAP_LAST_CAP;
+
+ CAP_FOR_EACH_U32(i) {
+ if (parent) {
+ out->set[0].caps[i] = cred->cap_inheritable.cap[i];
+ out->set[1].caps[i] = cred->cap_permitted.cap[i];
+ out->set[2].caps[i] = cred->cap_effective.cap[i];
+ out->set[3].caps[i] = cred->cap_bset.cap[i];
+ } else if (owner) {
+ out->set[0].caps[i] = 0U;
+ out->set[1].caps[i] = ~0U;
+ out->set[2].caps[i] = ~0U;
+ out->set[3].caps[i] = ~0U;
+ } else {
+ out->set[0].caps[i] = 0U;
+ out->set[1].caps[i] = 0U;
+ out->set[2].caps[i] = 0U;
+ out->set[3].caps[i] = 0U;
+ }
+ }
+
+ /* clear unused bits */
+ for (i = 0; i < 4; i++)
+ out->set[i].caps[CAP_TO_INDEX(CAP_LAST_CAP)] &=
+ CAP_LAST_U32_VALID_MASK;
+}
+
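+/*
+ * Example derived from the translation rules above: if the target
+ * user-namespace is a direct child of the sender's namespace and was
+ * created by the sender's euid, the 'owner' rule applies and the
+ * permitted, effective and bounding sets are reported as all-ones while
+ * the inheritable set stays empty.
+ */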
+/* This is equivalent to from_kuid_munged(), but maps INVALID_UID to itself */
+static uid_t kdbus_from_kuid_keep(struct user_namespace *ns, kuid_t uid)
+{
+ return uid_valid(uid) ? from_kuid_munged(ns, uid) : ((uid_t)-1);
+}
+
+/* This is equivalent to from_kgid_munged(), but maps INVALID_GID to itself */
+static gid_t kdbus_from_kgid_keep(struct user_namespace *ns, kgid_t gid)
+{
+ return gid_valid(gid) ? from_kgid_munged(ns, gid) : ((gid_t)-1);
+}
+
+struct kdbus_meta_staging {
+ const struct kdbus_meta_proc *mp;
+ const struct kdbus_meta_fake *mf;
+ const struct kdbus_meta_conn *mc;
+ const struct kdbus_conn *conn;
+ u64 mask;
+
+ void *exe;
+ const char *exe_path;
+};
+
+static size_t kdbus_meta_measure(struct kdbus_meta_staging *staging)
+{
+ const struct kdbus_meta_proc *mp = staging->mp;
+ const struct kdbus_meta_fake *mf = staging->mf;
+ const struct kdbus_meta_conn *mc = staging->mc;
+ const u64 mask = staging->mask;
+ size_t size = 0;
+
+ /* process metadata */
+
+ if (mf && (mask & KDBUS_ATTACH_CREDS))
+ size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_creds));
+ else if (mp && (mask & KDBUS_ATTACH_CREDS))
+ size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_creds));
+
+ if (mf && (mask & KDBUS_ATTACH_PIDS))
+ size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_pids));
+ else if (mp && (mask & KDBUS_ATTACH_PIDS))
+ size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_pids));
+
+ if (mp && (mask & KDBUS_ATTACH_AUXGROUPS))
+ size += KDBUS_ITEM_SIZE(mp->cred->group_info->ngroups *
+ sizeof(u64));
+
+ if (mp && (mask & KDBUS_ATTACH_TID_COMM))
+ size += KDBUS_ITEM_SIZE(strlen(mp->tid_comm) + 1);
+
+ if (mp && (mask & KDBUS_ATTACH_PID_COMM))
+ size += KDBUS_ITEM_SIZE(strlen(mp->pid_comm) + 1);
+
+ if (staging->exe_path && (mask & KDBUS_ATTACH_EXE))
+ size += KDBUS_ITEM_SIZE(strlen(staging->exe_path) + 1);
+
+ if (mp && (mask & KDBUS_ATTACH_CMDLINE))
+ size += KDBUS_ITEM_SIZE(strlen(mp->cmdline) + 1);
+
+ if (mp && (mask & KDBUS_ATTACH_CGROUP))
+ size += KDBUS_ITEM_SIZE(strlen(mp->cgroup) + 1);
+
+ if (mp && (mask & KDBUS_ATTACH_CAPS))
+ size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_meta_caps));
+
+ if (mf && (mask & KDBUS_ATTACH_SECLABEL))
+ size += KDBUS_ITEM_SIZE(strlen(mf->seclabel) + 1);
+ else if (mp && (mask & KDBUS_ATTACH_SECLABEL))
+ size += KDBUS_ITEM_SIZE(strlen(mp->seclabel) + 1);
+
+ if (mp && (mask & KDBUS_ATTACH_AUDIT))
+ size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_audit));
+
+ /* connection metadata */
+
+ if (mc && (mask & KDBUS_ATTACH_NAMES))
+ size += KDBUS_ALIGN8(mc->owned_names_size);
+
+ if (mc && (mask & KDBUS_ATTACH_CONN_DESCRIPTION))
+ size += KDBUS_ITEM_SIZE(strlen(mc->conn_description) + 1);
+
+ if (mc && (mask & KDBUS_ATTACH_TIMESTAMP))
+ size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_timestamp));
+
+ return size;
+}
+
+static struct kdbus_item *kdbus_write_head(struct kdbus_item **iter,
+ u64 type, u64 size)
+{
+ struct kdbus_item *item = *iter;
+ size_t padding;
+
+ item->type = type;
+ item->size = KDBUS_ITEM_HEADER_SIZE + size;
+
+ /* clear padding */
+ padding = KDBUS_ALIGN8(item->size) - item->size;
+ if (padding)
+ memset(item->data + size, 0, padding);
+
+ *iter = KDBUS_ITEM_NEXT(item);
+ return item;
+}
+
+static struct kdbus_item *kdbus_write_full(struct kdbus_item **iter,
+ u64 type, u64 size, const void *data)
+{
+ struct kdbus_item *item;
+
+ item = kdbus_write_head(iter, type, size);
+ memcpy(item->data, data, size);
+ return item;
+}
+
+static size_t kdbus_meta_write(struct kdbus_meta_staging *staging, void *mem,
+ size_t size)
+{
+ struct user_namespace *user_ns = staging->conn->cred->user_ns;
+ struct pid_namespace *pid_ns = ns_of_pid(staging->conn->pid);
+ struct kdbus_item *item = NULL, *items = mem;
+ u8 *end, *owned_names_end = NULL;
+
+ /* process metadata */
+
+ if (staging->mf && (staging->mask & KDBUS_ATTACH_CREDS)) {
+ const struct kdbus_meta_fake *mf = staging->mf;
+
+ item = kdbus_write_head(&items, KDBUS_ITEM_CREDS,
+ sizeof(struct kdbus_creds));
+ item->creds = (struct kdbus_creds){
+ .uid = kdbus_from_kuid_keep(user_ns, mf->uid),
+ .euid = kdbus_from_kuid_keep(user_ns, mf->euid),
+ .suid = kdbus_from_kuid_keep(user_ns, mf->suid),
+ .fsuid = kdbus_from_kuid_keep(user_ns, mf->fsuid),
+ .gid = kdbus_from_kgid_keep(user_ns, mf->gid),
+ .egid = kdbus_from_kgid_keep(user_ns, mf->egid),
+ .sgid = kdbus_from_kgid_keep(user_ns, mf->sgid),
+ .fsgid = kdbus_from_kgid_keep(user_ns, mf->fsgid),
+ };
+ } else if (staging->mp && (staging->mask & KDBUS_ATTACH_CREDS)) {
+ const struct cred *c = staging->mp->cred;
+
+ item = kdbus_write_head(&items, KDBUS_ITEM_CREDS,
+ sizeof(struct kdbus_creds));
+ item->creds = (struct kdbus_creds){
+ .uid = kdbus_from_kuid_keep(user_ns, c->uid),
+ .euid = kdbus_from_kuid_keep(user_ns, c->euid),
+ .suid = kdbus_from_kuid_keep(user_ns, c->suid),
+ .fsuid = kdbus_from_kuid_keep(user_ns, c->fsuid),
+ .gid = kdbus_from_kgid_keep(user_ns, c->gid),
+ .egid = kdbus_from_kgid_keep(user_ns, c->egid),
+ .sgid = kdbus_from_kgid_keep(user_ns, c->sgid),
+ .fsgid = kdbus_from_kgid_keep(user_ns, c->fsgid),
+ };
+ }
+
+ if (staging->mf && (staging->mask & KDBUS_ATTACH_PIDS)) {
+ item = kdbus_write_head(&items, KDBUS_ITEM_PIDS,
+ sizeof(struct kdbus_pids));
+ item->pids = (struct kdbus_pids){
+ .pid = pid_nr_ns(staging->mf->tgid, pid_ns),
+ .tid = pid_nr_ns(staging->mf->pid, pid_ns),
+ .ppid = pid_nr_ns(staging->mf->ppid, pid_ns),
+ };
+ } else if (staging->mp && (staging->mask & KDBUS_ATTACH_PIDS)) {
+ item = kdbus_write_head(&items, KDBUS_ITEM_PIDS,
+ sizeof(struct kdbus_pids));
+ item->pids = (struct kdbus_pids){
+ .pid = pid_nr_ns(staging->mp->tgid, pid_ns),
+ .tid = pid_nr_ns(staging->mp->pid, pid_ns),
+ .ppid = pid_nr_ns(staging->mp->ppid, pid_ns),
+ };
+ }
+
+ if (staging->mp && (staging->mask & KDBUS_ATTACH_AUXGROUPS)) {
+ const struct group_info *info = staging->mp->cred->group_info;
+ size_t i;
+
+ item = kdbus_write_head(&items, KDBUS_ITEM_AUXGROUPS,
+ info->ngroups * sizeof(u64));
+ for (i = 0; i < info->ngroups; ++i)
+ item->data64[i] = from_kgid_munged(user_ns,
+ GROUP_AT(info, i));
+ }
+
+ if (staging->mp && (staging->mask & KDBUS_ATTACH_TID_COMM))
+ item = kdbus_write_full(&items, KDBUS_ITEM_TID_COMM,
+ strlen(staging->mp->tid_comm) + 1,
+ staging->mp->tid_comm);
+
+ if (staging->mp && (staging->mask & KDBUS_ATTACH_PID_COMM))
+ item = kdbus_write_full(&items, KDBUS_ITEM_PID_COMM,
+ strlen(staging->mp->pid_comm) + 1,
+ staging->mp->pid_comm);
+
+ if (staging->exe_path && (staging->mask & KDBUS_ATTACH_EXE))
+ item = kdbus_write_full(&items, KDBUS_ITEM_EXE,
+ strlen(staging->exe_path) + 1,
+ staging->exe_path);
+
+ if (staging->mp && (staging->mask & KDBUS_ATTACH_CMDLINE))
+ item = kdbus_write_full(&items, KDBUS_ITEM_CMDLINE,
+ strlen(staging->mp->cmdline) + 1,
+ staging->mp->cmdline);
+
+ if (staging->mp && (staging->mask & KDBUS_ATTACH_CGROUP))
+ item = kdbus_write_full(&items, KDBUS_ITEM_CGROUP,
+ strlen(staging->mp->cgroup) + 1,
+ staging->mp->cgroup);
+
+ if (staging->mp && (staging->mask & KDBUS_ATTACH_CAPS)) {
+ item = kdbus_write_head(&items, KDBUS_ITEM_CAPS,
+ sizeof(struct kdbus_meta_caps));
+ kdbus_meta_export_caps((void *)&item->caps, staging->mp,
+ user_ns);
+ }
+
+ if (staging->mf && (staging->mask & KDBUS_ATTACH_SECLABEL))
+ item = kdbus_write_full(&items, KDBUS_ITEM_SECLABEL,
+ strlen(staging->mf->seclabel) + 1,
+ staging->mf->seclabel);
+ else if (staging->mp && (staging->mask & KDBUS_ATTACH_SECLABEL))
+ item = kdbus_write_full(&items, KDBUS_ITEM_SECLABEL,
+ strlen(staging->mp->seclabel) + 1,
+ staging->mp->seclabel);
+
+ if (staging->mp && (staging->mask & KDBUS_ATTACH_AUDIT)) {
+ item = kdbus_write_head(&items, KDBUS_ITEM_AUDIT,
+ sizeof(struct kdbus_audit));
+ item->audit = (struct kdbus_audit){
+ .loginuid = from_kuid(user_ns,
+ staging->mp->audit_loginuid),
+ .sessionid = staging->mp->audit_sessionid,
+ };
+ }
+
+ /* connection metadata */
+
+ if (staging->mc && (staging->mask & KDBUS_ATTACH_NAMES)) {
+ memcpy(items, staging->mc->owned_names_items,
+ KDBUS_ALIGN8(staging->mc->owned_names_size));
+ owned_names_end = (u8 *)items + staging->mc->owned_names_size;
+ items = (void *)KDBUS_ALIGN8((unsigned long)owned_names_end);
+ }
+
+ if (staging->mc && (staging->mask & KDBUS_ATTACH_CONN_DESCRIPTION))
+ item = kdbus_write_full(&items, KDBUS_ITEM_CONN_DESCRIPTION,
+ strlen(staging->mc->conn_description) + 1,
+ staging->mc->conn_description);
+
+ if (staging->mc && (staging->mask & KDBUS_ATTACH_TIMESTAMP))
+ item = kdbus_write_full(&items, KDBUS_ITEM_TIMESTAMP,
+ sizeof(staging->mc->ts),
+ &staging->mc->ts);
+
+ /*
+ * Return real size (minus trailing padding). In case of 'owned_names'
+ * we cannot deduce it from item->size, so treat it specially.
+ */
+
+ if (items == (void *)KDBUS_ALIGN8((unsigned long)owned_names_end))
+ end = owned_names_end;
+ else if (item)
+ end = (u8 *)item + item->size;
+ else
+ end = mem;
+
+ WARN_ON((u8 *)items - (u8 *)mem != size);
+ WARN_ON((void *)KDBUS_ALIGN8((unsigned long)end) != (void *)items);
+
+ return end - (u8 *)mem;
+}
+
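+/**
+ * kdbus_meta_emit() - serialize metadata into an item array
+ * @mp: Process metadata object, or NULL
+ * @mf: Fake metadata object, or NULL
+ * @mc: Connection metadata object, or NULL
+ * @conn: Target connection to translate metadata for
+ * @mask: Mask of metadata items to emit
+ * @out_items: Output location for the allocated item array (NULL if empty)
+ * @out_size: Output location for the size of the item array
+ *
+ * This serializes the previously collected metadata into a newly allocated
+ * array of items, translated into the namespaces pinned by @conn. The caller
+ * is responsible for freeing *@out_items.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */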
+int kdbus_meta_emit(struct kdbus_meta_proc *mp,
+ struct kdbus_meta_fake *mf,
+ struct kdbus_meta_conn *mc,
+ struct kdbus_conn *conn,
+ u64 mask,
+ struct kdbus_item **out_items,
+ size_t *out_size)
+{
+ struct kdbus_meta_staging staging = {};
+ struct kdbus_item *items = NULL;
+ size_t size = 0;
+ int ret;
+
+ if (WARN_ON(mf && mp))
+ mp = NULL;
+
+ staging.mp = mp;
+ staging.mf = mf;
+ staging.mc = mc;
+ staging.conn = conn;
+
+ /* get mask of valid items */
+ if (mf)
+ staging.mask |= mf->valid;
+ if (mp) {
+ mutex_lock(&mp->lock);
+ staging.mask |= mp->valid;
+ mutex_unlock(&mp->lock);
+ }
+ if (mc) {
+ mutex_lock(&mc->lock);
+ staging.mask |= mc->valid;
+ mutex_unlock(&mc->lock);
+ }
+
+ staging.mask &= mask;
+
+ if (!staging.mask) { /* bail out if nothing to do */
+ ret = 0;
+ goto exit;
+ }
+
+ /* EXE is special as it needs a temporary page to assemble */
+ if (mp && (staging.mask & KDBUS_ATTACH_EXE)) {
+ struct path p;
+
+ /*
+ * XXX: We need access to __d_path() so we can write the path
+ * relative to conn->root_path. Once upstream, we need
+ * EXPORT_SYMBOL(__d_path) or an equivalent of d_path() that
+ * takes the root path directly. Until then, we drop this item
+ * if the root-paths differ.
+ */
+
+ get_fs_root(current->fs, &p);
+ if (path_equal(&p, &conn->root_path)) {
+ staging.exe = (void *)__get_free_page(GFP_TEMPORARY);
+ if (!staging.exe) {
+ path_put(&p);
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ staging.exe_path = d_path(&mp->exe_path, staging.exe,
+ PAGE_SIZE);
+ if (IS_ERR(staging.exe_path)) {
+ path_put(&p);
+ ret = PTR_ERR(staging.exe_path);
+ goto exit;
+ }
+ }
+ path_put(&p);
+ }
+
+ size = kdbus_meta_measure(&staging);
+ if (!size) { /* bail out if nothing to do */
+ ret = 0;
+ goto exit;
+ }
+
+ items = kmalloc(size, GFP_KERNEL);
+ if (!items) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ size = kdbus_meta_write(&staging, items, size);
+ if (!size) {
+ kfree(items);
+ items = NULL;
+ }
+
+ ret = 0;
+
+exit:
+ if (staging.exe)
+ free_page((unsigned long)staging.exe);
+ if (ret >= 0) {
+ *out_items = items;
+ *out_size = size;
+ }
+ return ret;
+}
+
+enum {
+ KDBUS_META_PROC_NONE,
+ KDBUS_META_PROC_NORMAL,
+};
+
+/**
+ * kdbus_proc_permission() - check /proc permissions on target pid
+ * @pid_ns: namespace we operate in
+ * @cred: credentials of requestor
+ * @target: target process
+ *
+ * This checks whether a process with credentials @cred can access information
+ * of @target in the namespace @pid_ns. This tries to follow /proc permissions,
+ * but is slightly more restrictive.
+ *
+ * Return: The /proc access level (KDBUS_META_PROC_*) is returned.
+ */
+static unsigned int kdbus_proc_permission(const struct pid_namespace *pid_ns,
+ const struct cred *cred,
+ struct pid *target)
+{
+ if (pid_ns->hide_pid < 1)
+ return KDBUS_META_PROC_NORMAL;
+
+ /* XXX: we need groups_search() exported for aux-groups */
+ if (gid_eq(cred->egid, pid_ns->pid_gid))
+ return KDBUS_META_PROC_NORMAL;
+
+ /*
+ * XXX: If ptrace_may_access(PTRACE_MODE_READ) is granted, you can
+ * overwrite hide_pid. However, ptrace_may_access() only supports
+ * checking 'current', hence, we cannot use this here. But we
+ * simply decide to not support this override, so no need to worry.
+ */
+
+ return KDBUS_META_PROC_NONE;
+}
+
+/**
+ * kdbus_meta_proc_mask() - calculate which metadata would be visible to
+ * a connection via /proc
+ * @prv_pid: pid of metadata provider
+ * @req_pid: pid of metadata requestor
+ * @req_cred: credentials of metadata requestor
+ * @wanted: metadata that is requested
+ *
+ * This checks which metadata items of @prv_pid can be read via /proc by the
+ * requestor @req_pid.
+ *
+ * Return: Set of metadata flags the requestor can see (limited by @wanted).
+ */
+static u64 kdbus_meta_proc_mask(struct pid *prv_pid,
+ struct pid *req_pid,
+ const struct cred *req_cred,
+ u64 wanted)
+{
+ struct pid_namespace *prv_ns, *req_ns;
+ unsigned int proc;
+
+ prv_ns = ns_of_pid(prv_pid);
+ req_ns = ns_of_pid(req_pid);
+
+ /*
+ * If the sender is not visible in the receiver namespace, then the
+ * receiver cannot access the sender via its own procfs. Hence, we do
+ * not attach any additional metadata.
+ */
+ if (!pid_nr_ns(prv_pid, req_ns))
+ return 0;
+
+ /*
+ * If the pid-namespace of the receiver has hide_pid set, it cannot see
+ * any process but its own. We shortcut this /proc permission check if
+ * provider and requestor are the same. If not, we perform rather
+ * expensive /proc permission checks.
+ */
+ if (prv_pid == req_pid)
+ proc = KDBUS_META_PROC_NORMAL;
+ else
+ proc = kdbus_proc_permission(req_ns, req_cred, prv_pid);
+
+ /* you need /proc access to read standard process attributes */
+ if (proc < KDBUS_META_PROC_NORMAL)
+ wanted &= ~(KDBUS_ATTACH_TID_COMM |
+ KDBUS_ATTACH_PID_COMM |
+ KDBUS_ATTACH_SECLABEL |
+ KDBUS_ATTACH_CMDLINE |
+ KDBUS_ATTACH_CGROUP |
+ KDBUS_ATTACH_AUDIT |
+ KDBUS_ATTACH_CAPS |
+ KDBUS_ATTACH_EXE);
+
+ /* clear all non-/proc flags */
+ return wanted & (KDBUS_ATTACH_TID_COMM |
+ KDBUS_ATTACH_PID_COMM |
+ KDBUS_ATTACH_SECLABEL |
+ KDBUS_ATTACH_CMDLINE |
+ KDBUS_ATTACH_CGROUP |
+ KDBUS_ATTACH_AUDIT |
+ KDBUS_ATTACH_CAPS |
+ KDBUS_ATTACH_EXE);
+}
+
+/**
+ * kdbus_meta_get_mask() - calculate attach flags mask for metadata request
+ * @prv_pid: pid of metadata provider
+ * @prv_mask: mask of metadata the provider grants unchecked
+ * @req_pid: pid of metadata requestor
+ * @req_cred: credentials of metadata requestor
+ * @req_mask: mask of metadata that is requested
+ *
+ * This calculates the metadata items that the requestor @req_pid can access
+ * from the metadata provider @prv_pid. This permission check consists of
+ * several different parts:
+ * - Providers can grant metadata items unchecked. Regardless of their type,
+ * they're always granted to the requestor. This mask is passed as @prv_mask.
+ * - Basic items (credentials and connection metadata) are granted implicitly
+ * to everyone. They're publicly available to any bus-user that can see the
+ * provider.
+ * - Process credentials that are not granted implicitly follow the same
+ * permission checks as /proc. This means we always assume a requestor
+ * process has access to its *own* /proc mount if it has access to
+ * kdbusfs.
+ *
+ * Return: Mask of metadata that is granted.
+ */
+static u64 kdbus_meta_get_mask(struct pid *prv_pid, u64 prv_mask,
+ struct pid *req_pid,
+ const struct cred *req_cred, u64 req_mask)
+{
+ u64 missing, impl_mask, proc_mask = 0;
+
+ /*
+ * Connection metadata and basic unix process credentials are
+ * transmitted implicitly, and cannot be suppressed. Both are required
+ * to perform user-space policies on the receiver-side. Furthermore,
+ * connection metadata is public state, anyway, and unix credentials
+ * are needed for UDS-compatibility. We extend them slightly by
+ * auxiliary groups and additional uids/gids/pids.
+ */
+ impl_mask = /* connection metadata */
+ KDBUS_ATTACH_CONN_DESCRIPTION |
+ KDBUS_ATTACH_TIMESTAMP |
+ KDBUS_ATTACH_NAMES |
+ /* credentials and pids */
+ KDBUS_ATTACH_AUXGROUPS |
+ KDBUS_ATTACH_CREDS |
+ KDBUS_ATTACH_PIDS;
+
+ /*
+ * Calculate the set of metadata that is not granted implicitly nor by
+ * the sender, but still requested by the receiver. If any are left,
+ * perform rather expensive /proc access checks for them.
+ */
+ missing = req_mask & ~((prv_mask | impl_mask) & req_mask);
+ if (missing)
+ proc_mask = kdbus_meta_proc_mask(prv_pid, req_pid, req_cred,
+ missing);
+
+ return (prv_mask | impl_mask | proc_mask) & req_mask;
+}
+
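+/*
+ * Worked example for kdbus_meta_get_mask(): if the receiver requests
+ * KDBUS_ATTACH_PIDS | KDBUS_ATTACH_EXE, the sender only granted
+ * KDBUS_ATTACH_NAMES, and the /proc check denies access, then PIDS is
+ * still attached (implicit mask) while EXE is dropped, i.e. the result
+ * is KDBUS_ATTACH_PIDS.
+ */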
+/**
+ * kdbus_meta_info_mask() - calculate metadata mask for connection queries
+ * @conn: connection whose metadata is queried
+ * @mask: metadata mask requested by the querying task
+ *
+ * This calculates which metadata items of @conn the current task may read,
+ * based on the flags @conn grants unchecked and on /proc permissions.
+ *
+ * Return: Mask of metadata that is granted (limited by @mask).
+ */
+u64 kdbus_meta_info_mask(const struct kdbus_conn *conn, u64 mask)
+{
+ return kdbus_meta_get_mask(conn->pid,
+ atomic64_read(&conn->attach_flags_send),
+ task_pid(current),
+ current_cred(),
+ mask);
+}
+
+/**
+ * kdbus_meta_msg_mask() - calculate metadata mask for a message transaction
+ * @snd: sending connection
+ * @rcv: receiving connection
+ *
+ * This calculates which metadata items of the current task (sending on behalf
+ * of @snd) may be attached to a message delivered to @rcv, limited by the
+ * receiver's requested attach flags.
+ *
+ * Return: Mask of metadata that is granted.
+ */
+u64 kdbus_meta_msg_mask(const struct kdbus_conn *snd,
+ const struct kdbus_conn *rcv)
+{
+ return kdbus_meta_get_mask(task_pid(current),
+ atomic64_read(&snd->attach_flags_send),
+ rcv->pid,
+ rcv->cred,
+ atomic64_read(&rcv->attach_flags_recv));
+}
diff --git a/ipc/kdbus/metadata.h b/ipc/kdbus/metadata.h
new file mode 100644
index 000000000..dba7cc7fd
--- /dev/null
+++ b/ipc/kdbus/metadata.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_METADATA_H
+#define __KDBUS_METADATA_H
+
+#include <linux/kernel.h>
+
+struct kdbus_conn;
+struct kdbus_pool_slice;
+
+struct kdbus_meta_proc;
+struct kdbus_meta_conn;
+
+/**
+ * struct kdbus_meta_fake - Fake metadata
+ * @valid: Bitmask of collected and valid items
+ * @uid: UID of process
+ * @euid: EUID of process
+ * @suid: SUID of process
+ * @fsuid: FSUID of process
+ * @gid: GID of process
+ * @egid: EGID of process
+ * @sgid: SGID of process
+ * @fsgid: FSGID of process
+ * @pid: PID of process
+ * @tgid: TGID of process
+ * @ppid: PPID of process
+ * @seclabel: Seclabel
+ */
+struct kdbus_meta_fake {
+ u64 valid;
+
+ /* KDBUS_ITEM_CREDS */
+ kuid_t uid, euid, suid, fsuid;
+ kgid_t gid, egid, sgid, fsgid;
+
+ /* KDBUS_ITEM_PIDS */
+ struct pid *pid, *tgid, *ppid;
+
+ /* KDBUS_ITEM_SECLABEL */
+ char *seclabel;
+};
+
+struct kdbus_meta_proc *kdbus_meta_proc_new(void);
+struct kdbus_meta_proc *kdbus_meta_proc_ref(struct kdbus_meta_proc *mp);
+struct kdbus_meta_proc *kdbus_meta_proc_unref(struct kdbus_meta_proc *mp);
+int kdbus_meta_proc_collect(struct kdbus_meta_proc *mp, u64 what);
+
+struct kdbus_meta_fake *kdbus_meta_fake_new(void);
+struct kdbus_meta_fake *kdbus_meta_fake_free(struct kdbus_meta_fake *mf);
+int kdbus_meta_fake_collect(struct kdbus_meta_fake *mf,
+ const struct kdbus_creds *creds,
+ const struct kdbus_pids *pids,
+ const char *seclabel);
+
+struct kdbus_meta_conn *kdbus_meta_conn_new(void);
+struct kdbus_meta_conn *kdbus_meta_conn_ref(struct kdbus_meta_conn *mc);
+struct kdbus_meta_conn *kdbus_meta_conn_unref(struct kdbus_meta_conn *mc);
+int kdbus_meta_conn_collect(struct kdbus_meta_conn *mc,
+ struct kdbus_conn *conn,
+ u64 msg_seqnum, u64 what);
+
+int kdbus_meta_emit(struct kdbus_meta_proc *mp,
+ struct kdbus_meta_fake *mf,
+ struct kdbus_meta_conn *mc,
+ struct kdbus_conn *conn,
+ u64 mask,
+ struct kdbus_item **out_items,
+ size_t *out_size);
+u64 kdbus_meta_info_mask(const struct kdbus_conn *conn, u64 mask);
+u64 kdbus_meta_msg_mask(const struct kdbus_conn *snd,
+ const struct kdbus_conn *rcv);
+
+#endif
diff --git a/ipc/kdbus/names.c b/ipc/kdbus/names.c
new file mode 100644
index 000000000..057f8061c
--- /dev/null
+++ b/ipc/kdbus/names.c
@@ -0,0 +1,770 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/hash.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "names.h"
+#include "notify.h"
+#include "policy.h"
+
+struct kdbus_name_pending {
+ u64 flags;
+ struct kdbus_conn *conn;
+ struct kdbus_name_entry *name;
+ struct list_head conn_entry;
+ struct list_head name_entry;
+};
+
+static int kdbus_name_pending_new(struct kdbus_name_entry *e,
+ struct kdbus_conn *conn, u64 flags)
+{
+ struct kdbus_name_pending *p;
+
+ kdbus_conn_assert_active(conn);
+
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->flags = flags;
+ p->conn = conn;
+ p->name = e;
+ list_add_tail(&p->conn_entry, &conn->names_queue_list);
+ list_add_tail(&p->name_entry, &e->queue);
+
+ return 0;
+}
+
+static void kdbus_name_pending_free(struct kdbus_name_pending *p)
+{
+ if (!p)
+ return;
+
+ list_del(&p->name_entry);
+ list_del(&p->conn_entry);
+ kfree(p);
+}
+
+static struct kdbus_name_entry *
+kdbus_name_entry_new(struct kdbus_name_registry *r, u32 hash, const char *name)
+{
+ struct kdbus_name_entry *e;
+ size_t namelen;
+
+ namelen = strlen(name);
+
+ e = kmalloc(sizeof(*e) + namelen + 1, GFP_KERNEL);
+ if (!e)
+ return ERR_PTR(-ENOMEM);
+
+ e->name_id = ++r->name_seq_last;
+ e->flags = 0;
+ e->conn = NULL;
+ e->activator = NULL;
+ INIT_LIST_HEAD(&e->queue);
+ INIT_LIST_HEAD(&e->conn_entry);
+ hash_add(r->entries_hash, &e->hentry, hash);
+ memcpy(e->name, name, namelen + 1);
+
+ return e;
+}
+
+static void kdbus_name_entry_free(struct kdbus_name_entry *e)
+{
+ if (!e)
+ return;
+
+ WARN_ON(!list_empty(&e->conn_entry));
+ WARN_ON(!list_empty(&e->queue));
+ WARN_ON(e->activator);
+ WARN_ON(e->conn);
+
+ hash_del(&e->hentry);
+ kfree(e);
+}
+
+static void kdbus_name_entry_set_owner(struct kdbus_name_entry *e,
+ struct kdbus_conn *conn, u64 flags)
+{
+ WARN_ON(e->conn);
+
+ e->conn = kdbus_conn_ref(conn);
+ e->flags = flags;
+ atomic_inc(&conn->name_count);
+ list_add_tail(&e->conn_entry, &e->conn->names_list);
+}
+
+static void kdbus_name_entry_remove_owner(struct kdbus_name_entry *e)
+{
+ WARN_ON(!e->conn);
+
+ list_del_init(&e->conn_entry);
+ atomic_dec(&e->conn->name_count);
+ e->flags = 0;
+ e->conn = kdbus_conn_unref(e->conn);
+}
+
+static void kdbus_name_entry_replace_owner(struct kdbus_name_entry *e,
+ struct kdbus_conn *conn, u64 flags)
+{
+ if (WARN_ON(!e->conn) || WARN_ON(conn == e->conn))
+ return;
+
+ kdbus_notify_name_change(conn->ep->bus, KDBUS_ITEM_NAME_CHANGE,
+ e->conn->id, conn->id,
+ e->flags, flags, e->name);
+ kdbus_name_entry_remove_owner(e);
+ kdbus_name_entry_set_owner(e, conn, flags);
+}
+
+/**
+ * kdbus_name_is_valid() - check if a name is valid
+ * @p: The name to check
+ * @allow_wildcard: Whether or not to allow a wildcard name
+ *
+ * A name is valid if all of the following criteria are met:
+ *
+ * - The name has two or more elements separated by a period ('.') character.
+ * - All elements must contain at least one character.
+ * - Each element must only contain the ASCII characters "[A-Z][a-z][0-9]_-"
+ * and must not begin with a digit.
+ * - The name must not exceed KDBUS_NAME_MAX_LEN.
+ * - If @allow_wildcard is true, the name may end in '.*'
+ *
+ * Return: true if the name is valid, false otherwise.
+ */
+bool kdbus_name_is_valid(const char *p, bool allow_wildcard)
+{
+ bool dot, found_dot = false;
+ const char *q;
+
+ for (dot = true, q = p; *q; q++) {
+ if (*q == '.') {
+ if (dot)
+ return false;
+
+ found_dot = true;
+ dot = true;
+ } else {
+ bool good;
+
+ good = isalpha(*q) || (!dot && isdigit(*q)) ||
+ *q == '_' || *q == '-' ||
+ (allow_wildcard && dot &&
+ *q == '*' && *(q + 1) == '\0');
+
+ if (!good)
+ return false;
+
+ dot = false;
+ }
+ }
+
+ if (q - p > KDBUS_NAME_MAX_LEN)
+ return false;
+
+ if (dot)
+ return false;
+
+ if (!found_dot)
+ return false;
+
+ return true;
+}
+
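+/*
+ * Examples derived from the rules above: "org.example.Foo", "a.b-c_d" and,
+ * with @allow_wildcard set, "org.example.*" are valid; "org", ".foo.bar",
+ * "org..example" and "org.1example" are rejected.
+ */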
+/**
+ * kdbus_name_registry_new() - create a new name registry
+ *
+ * Return: a new kdbus_name_registry on success, ERR_PTR on failure.
+ */
+struct kdbus_name_registry *kdbus_name_registry_new(void)
+{
+ struct kdbus_name_registry *r;
+
+ r = kmalloc(sizeof(*r), GFP_KERNEL);
+ if (!r)
+ return ERR_PTR(-ENOMEM);
+
+ hash_init(r->entries_hash);
+ init_rwsem(&r->rwlock);
+ r->name_seq_last = 0;
+
+ return r;
+}
+
+/**
+ * kdbus_name_registry_free() - free the name registry
+ * @reg: The name registry, may be %NULL
+ *
+ * Clean up the name registry's internal structures and free it.
+ */
+void kdbus_name_registry_free(struct kdbus_name_registry *reg)
+{
+ if (!reg)
+ return;
+
+ WARN_ON(!hash_empty(reg->entries_hash));
+ kfree(reg);
+}
+
+static struct kdbus_name_entry *
+kdbus_name_find(struct kdbus_name_registry *reg, u32 hash, const char *name)
+{
+ struct kdbus_name_entry *e;
+
+ lockdep_assert_held(&reg->rwlock);
+
+ hash_for_each_possible(reg->entries_hash, e, hentry, hash)
+ if (strcmp(e->name, name) == 0)
+ return e;
+
+ return NULL;
+}
+
+/**
+ * kdbus_name_lookup_unlocked() - lookup name in registry
+ * @reg: name registry
+ * @name: name to lookup
+ *
+ * This looks up @name in the given name-registry and returns the
+ * kdbus_name_entry object. The caller must hold the registry-lock and must not
+ * access the returned object after releasing the lock.
+ *
+ * Return: Pointer to name-entry, or NULL if not found.
+ */
+struct kdbus_name_entry *
+kdbus_name_lookup_unlocked(struct kdbus_name_registry *reg, const char *name)
+{
+ return kdbus_name_find(reg, kdbus_strhash(name), name);
+}
+
+/**
+ * kdbus_name_acquire() - acquire a name
+ * @reg: The name registry
+ * @conn: The connection to pin this entry to
+ * @name: The name to acquire
+ * @flags: Acquisition flags (KDBUS_NAME_*)
+ * @return_flags: Pointer to return flags for the acquired name
+ * (KDBUS_NAME_*), may be %NULL
+ *
+ * Callers must ensure that @conn is either a privileged bus user or has
+ * sufficient privileges in the policy-db to own the well-known name @name.
+ *
+ * Return: 0 on success, negative error number on failure.
+ */
+int kdbus_name_acquire(struct kdbus_name_registry *reg,
+ struct kdbus_conn *conn, const char *name,
+ u64 flags, u64 *return_flags)
+{
+ struct kdbus_name_entry *e;
+ u64 rflags = 0;
+ int ret = 0;
+ u32 hash;
+
+ kdbus_conn_assert_active(conn);
+
+ down_write(&reg->rwlock);
+
+ if (!kdbus_conn_policy_own_name(conn, current_cred(), name)) {
+ ret = -EPERM;
+ goto exit_unlock;
+ }
+
+ hash = kdbus_strhash(name);
+ e = kdbus_name_find(reg, hash, name);
+ if (!e) {
+ /* claim new name */
+
+ if (conn->activator_of) {
+ ret = -EINVAL;
+ goto exit_unlock;
+ }
+
+ e = kdbus_name_entry_new(reg, hash, name);
+ if (IS_ERR(e)) {
+ ret = PTR_ERR(e);
+ goto exit_unlock;
+ }
+
+ if (kdbus_conn_is_activator(conn)) {
+ e->activator = kdbus_conn_ref(conn);
+ conn->activator_of = e;
+ }
+
+ kdbus_name_entry_set_owner(e, conn, flags);
+ kdbus_notify_name_change(e->conn->ep->bus, KDBUS_ITEM_NAME_ADD,
+ 0, e->conn->id, 0, e->flags, e->name);
+ } else if (e->conn == conn || e == conn->activator_of) {
+ /* connection already owns that name */
+ ret = -EALREADY;
+ } else if (kdbus_conn_is_activator(conn)) {
+ /* activator claims existing name */
+
+ if (conn->activator_of) {
+ ret = -EINVAL; /* multiple names not allowed */
+ } else if (e->activator) {
+ ret = -EEXIST; /* only one activator per name */
+ } else {
+ e->activator = kdbus_conn_ref(conn);
+ conn->activator_of = e;
+ }
+ } else if (e->flags & KDBUS_NAME_ACTIVATOR) {
+ /* claim name of an activator */
+
+ kdbus_conn_move_messages(conn, e->activator, 0);
+ kdbus_name_entry_replace_owner(e, conn, flags);
+ } else if ((flags & KDBUS_NAME_REPLACE_EXISTING) &&
+ (e->flags & KDBUS_NAME_ALLOW_REPLACEMENT)) {
+ /* claim name of a previous owner */
+
+ if (e->flags & KDBUS_NAME_QUEUE) {
+ /* move owner back to queue if they asked for it */
+ ret = kdbus_name_pending_new(e, e->conn, e->flags);
+ if (ret < 0)
+ goto exit_unlock;
+ }
+
+ kdbus_name_entry_replace_owner(e, conn, flags);
+ } else if (flags & KDBUS_NAME_QUEUE) {
+ /* add to waiting-queue of the name */
+
+ ret = kdbus_name_pending_new(e, conn, flags);
+ if (ret >= 0)
+ /* tell the caller that we queued it */
+ rflags |= KDBUS_NAME_IN_QUEUE;
+ } else {
+ /* the name is busy, return a failure */
+ ret = -EEXIST;
+ }
+
+ if (ret == 0 && return_flags)
+ *return_flags = rflags;
+
+exit_unlock:
+ up_write(&reg->rwlock);
+ kdbus_notify_flush(conn->ep->bus);
+ return ret;
+}
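+
+/*
+ * Caller-side sketch (hypothetical @reg and @conn; see kdbus_cmd_name_acquire()
+ * below for the real in-tree caller): request a name and fall back to waiting
+ * in its queue if it is currently owned by someone else.
+ *
+ *	u64 rflags = 0;
+ *	int ret;
+ *
+ *	ret = kdbus_name_acquire(reg, conn, "org.example.Service",
+ *				 KDBUS_NAME_QUEUE, &rflags);
+ *	if (ret == 0 && (rflags & KDBUS_NAME_IN_QUEUE))
+ *		;	/* not owner yet, we were put on the waiting queue */
+ */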
+
+static void kdbus_name_release_unlocked(struct kdbus_name_registry *reg,
+ struct kdbus_name_entry *e)
+{
+ struct kdbus_name_pending *p;
+
+ lockdep_assert_held(&reg->rwlock);
+
+ p = list_first_entry_or_null(&e->queue, struct kdbus_name_pending,
+ name_entry);
+
+ if (p) {
+ /* give it to first active waiter in the queue */
+ kdbus_name_entry_replace_owner(e, p->conn, p->flags);
+ kdbus_name_pending_free(p);
+ } else if (e->activator && e->activator != e->conn) {
+ /* hand it back to an active activator connection */
+ kdbus_conn_move_messages(e->activator, e->conn, e->name_id);
+ kdbus_name_entry_replace_owner(e, e->activator,
+ KDBUS_NAME_ACTIVATOR);
+ } else {
+ /* release the name */
+ kdbus_notify_name_change(e->conn->ep->bus,
+ KDBUS_ITEM_NAME_REMOVE,
+ e->conn->id, 0, e->flags, 0, e->name);
+ kdbus_name_entry_remove_owner(e);
+ kdbus_name_entry_free(e);
+ }
+}
+
+static int kdbus_name_release(struct kdbus_name_registry *reg,
+ struct kdbus_conn *conn,
+ const char *name)
+{
+ struct kdbus_name_pending *p;
+ struct kdbus_name_entry *e;
+ int ret = 0;
+
+ down_write(&reg->rwlock);
+ e = kdbus_name_find(reg, kdbus_strhash(name), name);
+ if (!e) {
+ ret = -ESRCH;
+ } else if (e->conn == conn) {
+ kdbus_name_release_unlocked(reg, e);
+ } else {
+ ret = -EADDRINUSE;
+ list_for_each_entry(p, &e->queue, name_entry) {
+ if (p->conn == conn) {
+ kdbus_name_pending_free(p);
+ ret = 0;
+ break;
+ }
+ }
+ }
+ up_write(&reg->rwlock);
+
+ kdbus_notify_flush(conn->ep->bus);
+ return ret;
+}
+
+/**
+ * kdbus_name_release_all() - remove all name entries of a given connection
+ * @reg: name registry
+ * @conn: connection
+ */
+void kdbus_name_release_all(struct kdbus_name_registry *reg,
+ struct kdbus_conn *conn)
+{
+ struct kdbus_name_pending *p;
+ struct kdbus_conn *activator = NULL;
+ struct kdbus_name_entry *e;
+
+ down_write(&reg->rwlock);
+
+ if (conn->activator_of) {
+ activator = conn->activator_of->activator;
+ conn->activator_of->activator = NULL;
+ }
+
+ while ((p = list_first_entry_or_null(&conn->names_queue_list,
+ struct kdbus_name_pending,
+ conn_entry)))
+ kdbus_name_pending_free(p);
+ while ((e = list_first_entry_or_null(&conn->names_list,
+ struct kdbus_name_entry,
+ conn_entry)))
+ kdbus_name_release_unlocked(reg, e);
+
+ up_write(&reg->rwlock);
+
+ kdbus_conn_unref(activator);
+ kdbus_notify_flush(conn->ep->bus);
+}
+
+/**
+ * kdbus_cmd_name_acquire() - handle KDBUS_CMD_NAME_ACQUIRE
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_name_acquire(struct kdbus_conn *conn, void __user *argp)
+{
+ const char *item_name;
+ struct kdbus_cmd *cmd;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_NAME, .mandatory = true },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE |
+ KDBUS_NAME_REPLACE_EXISTING |
+ KDBUS_NAME_ALLOW_REPLACEMENT |
+ KDBUS_NAME_QUEUE,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ if (!kdbus_conn_is_ordinary(conn))
+ return -EOPNOTSUPP;
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ item_name = argv[1].item->str;
+ if (!kdbus_name_is_valid(item_name, false)) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Do atomic_inc_return here to reserve our slot, then decrement
+ * it before returning.
+ */
+ if (atomic_inc_return(&conn->name_count) > KDBUS_CONN_MAX_NAMES) {
+ ret = -E2BIG;
+ goto exit_dec;
+ }
+
+ ret = kdbus_name_acquire(conn->ep->bus->name_registry, conn, item_name,
+ cmd->flags, &cmd->return_flags);
+
+exit_dec:
+ atomic_dec(&conn->name_count);
+exit:
+ return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_name_release() - handle KDBUS_CMD_NAME_RELEASE
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_name_release(struct kdbus_conn *conn, void __user *argp)
+{
+ struct kdbus_cmd *cmd;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ { .type = KDBUS_ITEM_NAME, .mandatory = true },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ if (!kdbus_conn_is_ordinary(conn))
+ return -EOPNOTSUPP;
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ ret = kdbus_name_release(conn->ep->bus->name_registry, conn,
+ argv[1].item->str);
+ return kdbus_args_clear(&args, ret);
+}
+
+static int kdbus_list_write(struct kdbus_conn *conn,
+ struct kdbus_conn *c,
+ struct kdbus_pool_slice *slice,
+ size_t *pos,
+ struct kdbus_name_entry *e,
+ bool write)
+{
+ struct kvec kvec[4];
+ size_t cnt = 0;
+ int ret;
+
+ /* info header */
+ struct kdbus_info info = {
+ .size = 0,
+ .id = c->id,
+ .flags = c->flags,
+ };
+
+ /* fake the header of a kdbus_name item */
+ struct {
+ u64 size;
+ u64 type;
+ u64 flags;
+ } h = {};
+
+ if (e && !kdbus_conn_policy_see_name_unlocked(conn, current_cred(),
+ e->name))
+ return 0;
+
+ kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &info.size);
+
+ /* append name */
+ if (e) {
+ size_t slen = strlen(e->name) + 1;
+
+ h.size = offsetof(struct kdbus_item, name.name) + slen;
+ h.type = KDBUS_ITEM_OWNED_NAME;
+ h.flags = e->flags;
+
+ kdbus_kvec_set(&kvec[cnt++], &h, sizeof(h), &info.size);
+ kdbus_kvec_set(&kvec[cnt++], e->name, slen, &info.size);
+ cnt += !!kdbus_kvec_pad(&kvec[cnt], &info.size);
+ }
+
+ if (write) {
+ ret = kdbus_pool_slice_copy_kvec(slice, *pos, kvec,
+ cnt, info.size);
+ if (ret < 0)
+ return ret;
+ }
+
+ *pos += info.size;
+ return 0;
+}
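+
+/*
+ * Each record produced above is a struct kdbus_info describing the
+ * connection, optionally followed by a single KDBUS_ITEM_OWNED_NAME item
+ * carrying the name (padded via kdbus_kvec_pad()). info.size accumulates
+ * the total record size, by which *pos is advanced.
+ */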
+
+static int kdbus_list_all(struct kdbus_conn *conn, u64 flags,
+ struct kdbus_pool_slice *slice,
+ size_t *pos, bool write)
+{
+ struct kdbus_conn *c;
+ size_t p = *pos;
+ int ret, i;
+
+ hash_for_each(conn->ep->bus->conn_hash, i, c, hentry) {
+ bool added = false;
+
+ /* skip monitors */
+ if (kdbus_conn_is_monitor(c))
+ continue;
+
+ /* skip activators */
+ if (!(flags & KDBUS_LIST_ACTIVATORS) &&
+ kdbus_conn_is_activator(c))
+ continue;
+
+ /* all names the connection owns */
+ if (flags & (KDBUS_LIST_NAMES | KDBUS_LIST_ACTIVATORS)) {
+ struct kdbus_name_entry *e;
+
+ list_for_each_entry(e, &c->names_list, conn_entry) {
+ struct kdbus_conn *a = e->activator;
+
+ if ((flags & KDBUS_LIST_ACTIVATORS) &&
+ a && a != c) {
+ ret = kdbus_list_write(conn, a, slice,
+ &p, e, write);
+ if (ret < 0)
+ return ret;
+
+ added = true;
+ }
+
+ if (flags & KDBUS_LIST_NAMES ||
+ kdbus_conn_is_activator(c)) {
+ ret = kdbus_list_write(conn, c, slice,
+ &p, e, write);
+ if (ret < 0)
+ return ret;
+
+ added = true;
+ }
+ }
+ }
+
+ /* queue of names the connection is currently waiting for */
+ if (flags & KDBUS_LIST_QUEUED) {
+ struct kdbus_name_pending *q;
+
+ list_for_each_entry(q, &c->names_queue_list,
+ conn_entry) {
+ ret = kdbus_list_write(conn, c, slice, &p,
+ q->name, write);
+ if (ret < 0)
+ return ret;
+
+ added = true;
+ }
+ }
+
+ /* nothing added so far, just add the unique ID */
+ if (!added && flags & KDBUS_LIST_UNIQUE) {
+ ret = kdbus_list_write(conn, c, slice, &p, NULL, write);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ *pos = p;
+ return 0;
+}
+
+/**
+ * kdbus_cmd_list() - handle KDBUS_CMD_LIST
+ * @conn: connection to operate on
+ * @argp: command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_list(struct kdbus_conn *conn, void __user *argp)
+{
+ struct kdbus_name_registry *reg = conn->ep->bus->name_registry;
+ struct kdbus_pool_slice *slice = NULL;
+ struct kdbus_cmd_list *cmd;
+ size_t pos, size;
+ int ret;
+
+ struct kdbus_arg argv[] = {
+ { .type = KDBUS_ITEM_NEGOTIATE },
+ };
+ struct kdbus_args args = {
+ .allowed_flags = KDBUS_FLAG_NEGOTIATE |
+ KDBUS_LIST_UNIQUE |
+ KDBUS_LIST_NAMES |
+ KDBUS_LIST_ACTIVATORS |
+ KDBUS_LIST_QUEUED,
+ .argv = argv,
+ .argc = ARRAY_SIZE(argv),
+ };
+
+ ret = kdbus_args_parse(&args, argp, &cmd);
+ if (ret != 0)
+ return ret;
+
+ /* lock order: domain -> bus -> ep -> names -> conn */
+ down_read(&reg->rwlock);
+ down_read(&conn->ep->bus->conn_rwlock);
+ down_read(&conn->ep->policy_db.entries_rwlock);
+
+ /* size of records */
+ size = 0;
+ ret = kdbus_list_all(conn, cmd->flags, NULL, &size, false);
+ if (ret < 0)
+ goto exit_unlock;
+
+ if (size == 0) {
+ kdbus_pool_publish_empty(conn->pool, &cmd->offset,
+ &cmd->list_size);
+ } else {
+ slice = kdbus_pool_slice_alloc(conn->pool, size, false);
+ if (IS_ERR(slice)) {
+ ret = PTR_ERR(slice);
+ slice = NULL;
+ goto exit_unlock;
+ }
+
+ /* copy the records */
+ pos = 0;
+ ret = kdbus_list_all(conn, cmd->flags, slice, &pos, true);
+ if (ret < 0)
+ goto exit_unlock;
+
+ WARN_ON(pos != size);
+ kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->list_size);
+ }
+
+ if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) ||
+ kdbus_member_set_user(&cmd->list_size, argp,
+ typeof(*cmd), list_size))
+ ret = -EFAULT;
+
+exit_unlock:
+ up_read(&conn->ep->policy_db.entries_rwlock);
+ up_read(&conn->ep->bus->conn_rwlock);
+ up_read(&reg->rwlock);
+ kdbus_pool_slice_release(slice);
+ return kdbus_args_clear(&args, ret);
+}
diff --git a/ipc/kdbus/names.h b/ipc/kdbus/names.h
new file mode 100644
index 000000000..3dd258929
--- /dev/null
+++ b/ipc/kdbus/names.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_NAMES_H
+#define __KDBUS_NAMES_H
+
+#include <linux/hashtable.h>
+#include <linux/rwsem.h>
+
+/**
+ * struct kdbus_name_registry - names registered for a bus
+ * @entries_hash: Map of entries
+ * @rwlock: Registry data lock
+ * @name_seq_last: Last used sequence number to assign to a name entry
+ */
+struct kdbus_name_registry {
+ DECLARE_HASHTABLE(entries_hash, 8);
+ struct rw_semaphore rwlock;
+ u64 name_seq_last;
+};
+
+/**
+ * struct kdbus_name_entry - well-known name entry
+ * @name_id: Sequence number of name entry to be able to uniquely
+ * identify a name over its registration lifetime
+ * @flags: KDBUS_NAME_* flags
+ * @conn: Connection owning the name
+ * @activator: Connection of the activator queuing incoming messages
+ * @queue: List of queued connections
+ * @conn_entry: Entry in connection
+ * @hentry: Entry in registry map
+ * @name: The well-known name
+ */
+struct kdbus_name_entry {
+ u64 name_id;
+ u64 flags;
+ struct kdbus_conn *conn;
+ struct kdbus_conn *activator;
+ struct list_head queue;
+ struct list_head conn_entry;
+ struct hlist_node hentry;
+ char name[];
+};
+
+bool kdbus_name_is_valid(const char *p, bool allow_wildcard);
+
+struct kdbus_name_registry *kdbus_name_registry_new(void);
+void kdbus_name_registry_free(struct kdbus_name_registry *reg);
+
+struct kdbus_name_entry *
+kdbus_name_lookup_unlocked(struct kdbus_name_registry *reg, const char *name);
+
+int kdbus_name_acquire(struct kdbus_name_registry *reg,
+ struct kdbus_conn *conn, const char *name,
+ u64 flags, u64 *return_flags);
+void kdbus_name_release_all(struct kdbus_name_registry *reg,
+ struct kdbus_conn *conn);
+
+int kdbus_cmd_name_acquire(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_name_release(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_list(struct kdbus_conn *conn, void __user *argp);
+
+#endif
diff --git a/ipc/kdbus/node.c b/ipc/kdbus/node.c
new file mode 100644
index 000000000..89f58bc85
--- /dev/null
+++ b/ipc/kdbus/node.c
@@ -0,0 +1,897 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/atomic.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/kdev_t.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include "bus.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "fs.h"
+#include "handle.h"
+#include "node.h"
+#include "util.h"
+
+/**
+ * DOC: kdbus nodes
+ *
+ * Nodes unify lifetime management across exposed kdbus objects and provide a
+ * hierarchy. Each kdbus object that might be exposed to user-space has a
+ * kdbus_node object embedded and is linked into the hierarchy. Each node can
+ * have any number (0-n) of child nodes linked. Each child retains a reference
+ * to its parent node. For root-nodes, the parent is NULL.
+ *
+ * Each node object goes through a bunch of states during its lifetime:
+ * * NEW
+ * * LINKED (can be skipped by NEW->FREED transition)
+ * * ACTIVE (can be skipped by LINKED->INACTIVE transition)
+ * * INACTIVE
+ * * DRAINED
+ * * FREED
+ *
+ * Each node is allocated by the caller and initialized via kdbus_node_init().
+ * This never fails and sets the object into state NEW. From now on, ref-counts
+ * on the node manage its lifetime. During init, the ref-count is set to 1. Once
+ * it drops to 0, the node goes to state FREED and the node->free_cb() callback
+ * is called to deallocate any memory.
+ *
+ * After initializing a node, you usually link it into the hierarchy. You need
+ * to provide a parent node and a name. The node will be linked as child to the
+ * parent and a globally unique ID is assigned to the child. The name of the
+ * child must be unique for all children of this parent. Otherwise, linking the
+ * child will fail with -EEXIST.
+ * Note that the child is not marked active yet. Linking it does prevent any
+ * other node from being linked with the same name (thus, it reserves that
+ * name), but any child-lookup (via name or unique ID) will never return this
+ * child until it has been marked active.
+ *
+ * Once successfully linked, you can use kdbus_node_activate() to activate a
+ * child. This will mark the child active. This state can be skipped by directly
+ * deactivating the child via kdbus_node_deactivate() (see below).
+ * By activating a child, you enable any lookups on this child to succeed from
+ * now on. Furthermore, any code that got its hands on a reference to the node,
+ * can from now on "acquire" the node.
+ *
+ * Active References (or: 'acquiring' and 'releasing' a node)
+ * Additionally to normal object references, nodes support something we call
+ * "active references". An active reference can be acquired via
+ * kdbus_node_acquire() and released via kdbus_node_release(). A caller
+ * _must_ own a normal object reference whenever calling those functions.
+ * Unlike object references, acquiring an active reference can fail (by
+ * returning 'false' from kdbus_node_acquire()). An active reference can
+ * only be acquired if the node is marked active. If it is not marked
+ * active, yet, or if it was already deactivated, no more active references
+ * can be acquired, ever!
+ * Active references are used to track tasks working on a node. Whenever a
+ * task enters kernel-space to perform an action on a node, it acquires an
+ * active reference, performs the action and releases the reference again
+ * (see the usage sketch following this comment block).
+ * While holding an active reference, the node is guaranteed to stay active.
+ * If the node is deactivated in parallel, the node is marked as
+ * deactivated, then we wait for all active references to be dropped, before
+ * we finally proceed with any cleanups. That is, if you hold an active
+ * reference to a node, any resources that are bound to the "active" state
+ * are guaranteed to stay accessible until you release your reference.
+ *
+ * Active-references are very similar to rw-locks, where acquiring a node is
+ * equal to try-read-lock and releasing to read-unlock. Deactivating a node
+ * means write-lock and never releasing it again.
+ * Unlike rw-locks, the 'active reference' concept is more versatile and
+ * avoids unusual rw-lock usage (never releasing a write-lock..).
+ *
+ * It is safe to acquire multiple active-references recursively. But you
+ * need to check the return value of kdbus_node_acquire() on _each_ call. It
+ * may stop granting references at _any_ time.
+ *
+ * You're free to perform any operations you want while holding an active
+ * reference, except sleeping for an indefinite period. Sleeping for a fixed
+ * amount of time is fine, but you usually should not wait on wait-queues
+ * without a timeout.
+ * For example, if you wait for I/O to happen, you should gather all data
+ * and schedule the I/O operation, then release your active reference and
+ * wait for it to complete. Then try to acquire a new reference. If it
+ * fails, perform any cleanup (the node is now dead). Otherwise, you can
+ * finish your operation.
+ *
+ * All nodes can be deactivated via kdbus_node_deactivate() at any time. You can
+ * call this multiple times, even in parallel or on nodes that were never
+ * linked, and it will just work. The only restriction is, you must not hold an
+ * active reference when calling kdbus_node_deactivate().
+ * By deactivating a node, it is immediately marked inactive. Then, we wait for
+ * all active references to be released (called 'draining' the node). This
+ * shouldn't take very long as we don't perform long-lasting operations while
+ * holding an active reference. Note that once the node is marked inactive, no
+ * new active references can be acquired.
+ * Once all active references are dropped, the node is considered 'drained'. Now
+ * kdbus_node_deactivate() is called on each child of the node before we
+ * continue deactivating our node. That is, once all children are entirely
+ * deactivated, we call ->release_cb() of our node. ->release_cb() can release
+ * any resources on that node which are bound to the "active" state of a node.
+ * When done, we unlink the node from its parent rb-tree, mark it as
+ * 'released' and return.
+ * If kdbus_node_deactivate() is called multiple times (even in parallel), all
+ * but one caller will just wait until the node is fully deactivated. That is,
+ * one random caller of kdbus_node_deactivate() is selected to call
+ * ->release_cb() and cleanup the node. Only once all this is done, all other
+ * callers will return from kdbus_node_deactivate(). That is, it doesn't matter
+ * whether you're the selected caller or not, it will only return after
+ * everything is fully done.
+ *
+ * When a node is activated, we acquire a normal object reference to the node.
+ * This reference is dropped after deactivation is fully done (and only if the
+ * node really was activated). This allows callers to link+activate a child node
+ * and then drop all refs. The node will be deactivated together with the
+ * parent, and then be freed when this reference is dropped.
+ *
+ * Currently, nodes provide a bunch of resources that external code can use
+ * directly. This includes:
+ *
+ * * node->waitq: Each node has its own wait-queue that is used to manage
+ * the 'active' state. When a node is deactivated, we wait on
+ * this queue until all active refs are dropped. Analogously,
+ * when you release an active reference on a deactivated
+ * node, and the active ref-count drops to 0, we wake up a
+ * single thread on this queue. Furthermore, once the
+ * ->release_cb() callback finished, we wake up all waiters.
+ * The node-owner is free to re-use this wait-queue for other
+ * purposes. As node-management uses this queue only during
+ * deactivation, it is usually totally fine to re-use the
+ * queue for other, preferably low-overhead, use-cases.
+ *
+ * * node->type: This field defines the type of the owner of this node. It
+ * must be set during node initialization and must remain
+ * constant. The node management never looks at this value,
+ * but external users might use it to gain access to the owner
+ * object of a node.
+ * It is totally up to the owner of the node to define what
+ * their type means. Usually it means you can access the
+ * parent structure via container_of(), as long as you hold an
+ * active reference to the node.
+ *
+ * * node->free_cb: callback after all references are dropped
+ * node->release_cb: callback during node deactivation
+ * These fields must be set by the node owner during
+ * node initialization. They must remain constant. If
+ * NULL, they're skipped.
+ *
+ * * node->mode: filesystem access modes
+ * node->uid: filesystem owner uid
+ * node->gid: filesystem owner gid
+ * These fields must be set by the node owner during node
+ * initialization. They must remain constant and may be
+ * accessed by other callers to properly initialize
+ * filesystem nodes.
+ *
+ * * node->id: This is an unsigned 32bit integer allocated by an IDA. It is
+ * always kept as small as possible during allocation and is
+ * globally unique across all nodes allocated by this module. 0
+ * is reserved as "not assigned" and is the default.
+ * The ID is assigned during kdbus_node_link() and is kept until
+ * the object is freed. Thus, the ID surpasses the active
+ * lifetime of a node. As long as you hold an object reference
+ * to a node (and the node was linked once), the ID is valid and
+ * unique.
+ *
+ * * node->name: name of this node
+ * node->hash: 31bit hash-value of @name (range [2..INT_MAX-1])
+ * These values follow the same lifetime rules as node->id.
+ * They're initialized when the node is linked and then remain
+ * constant until the last object reference is dropped.
+ * Unlike the id, the name is only unique across all siblings
+ * and only until the node is deactivated. Currently, the name
+ * is even unique if linked but not activated, yet. This might
+ * change in the future, though. Code should not rely on this.
+ *
+ * * node->lock: lock to protect node->children, node->rb, node->parent
+ * * node->parent: Reference to parent node. This is set during LINK time
+ * and is dropped during destruction. You must not access
+ * it unless you hold an active reference to the node or if
+ * you know the node is dead.
+ * * node->children: rb-tree of all linked children of this node. You must
+ * not access this directly, but use one of the iterator
+ * or lookup helpers.
+ */
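+
+/*
+ * Minimal active-reference usage sketch (assuming the caller already holds
+ * an object reference to @node, as required above):
+ *
+ *	if (!kdbus_node_acquire(node))
+ *		return -ESHUTDOWN;	/* node was already deactivated */
+ *
+ *	... access resources bound to the node's active state ...
+ *
+ *	kdbus_node_release(node);
+ */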
+
+/*
+ * Bias values track states of "active references". They're all negative. If a
+ * node is active, its active-ref-counter is >=0 and tracks all active
+ * references. Once a node is deactivated, we subtract NODE_BIAS. This means the
+ * counter is now negative but still counts the active references. Once it drops
+ * to exactly NODE_BIAS, we know all active references were dropped. Exactly one
+ * thread will change it to NODE_RELEASE now, perform cleanup and then put it
+ * into NODE_DRAINED. Once drained, all other threads that tried deactivating
+ * the node will now be woken up (thus, they wait until the node is fully done).
+ * The initial state during node-setup is NODE_NEW. If a node is directly
+ * deactivated without having ever been active, it is put into
+ * NODE_RELEASE_DIRECT instead of NODE_BIAS. This tracks this one-bit state
+ * across node-deactivation. The task putting it into NODE_RELEASE now knows
+ * whether the node was active before or not.
+ *
+ * Some archs implement atomic_sub(v) with atomic_add(-v), so reserve INT_MIN
+ * to avoid overflows if multiplied by -1.
+ */
+#define KDBUS_NODE_BIAS (INT_MIN + 5)
+#define KDBUS_NODE_RELEASE_DIRECT (KDBUS_NODE_BIAS - 1)
+#define KDBUS_NODE_RELEASE (KDBUS_NODE_BIAS - 2)
+#define KDBUS_NODE_DRAINED (KDBUS_NODE_BIAS - 3)
+#define KDBUS_NODE_NEW (KDBUS_NODE_BIAS - 4)
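+
+/*
+ * Worked example: a node holding three active references has ->active == 3.
+ * Deactivation adds KDBUS_NODE_BIAS, so the counter becomes 3 + BIAS; once
+ * those three references are released it reaches exactly KDBUS_NODE_BIAS and
+ * the draining thread is woken up.
+ */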
+
+/* global unique ID mapping for kdbus nodes */
+DEFINE_IDA(kdbus_node_ida);
+
+/**
+ * kdbus_node_name_hash() - hash a name
+ * @name: The string to hash
+ *
+ * This computes the hash of @name. It is guaranteed to be in the range
+ * [2..INT_MAX-1]. The values 0, 1 and INT_MAX are unused as they are reserved
+ * for the filesystem code.
+ *
+ * Return: hash value of the passed string
+ */
+static unsigned int kdbus_node_name_hash(const char *name)
+{
+ unsigned int hash;
+
+ /* reserve hash numbers 0, 1 and >=INT_MAX for magic directories */
+ hash = kdbus_strhash(name) & INT_MAX;
+ if (hash < 2)
+ hash += 2;
+ if (hash >= INT_MAX)
+ hash = INT_MAX - 1;
+
+ return hash;
+}
+
+/**
+ * kdbus_node_name_compare() - compare a name with a node's name
+ * @hash: hash of the string to compare the node with
+ * @name: name to compare the node with
+ * @node: node to compare the name with
+ *
+ * Return: 0 if @name and @hash exactly match the information in @node, or
+ * an integer less than or greater than zero if @name is found, respectively,
+ * to be less than or be greater than the string stored in @node.
+ */
+static int kdbus_node_name_compare(unsigned int hash, const char *name,
+ const struct kdbus_node *node)
+{
+ if (hash != node->hash)
+ return hash - node->hash;
+
+ return strcmp(name, node->name);
+}
+
+/**
+ * kdbus_node_init() - initialize a kdbus_node
+ * @node: Pointer to the node to initialize
+ * @type: The type the node will have (KDBUS_NODE_*)
+ *
+ * The caller is responsible for allocating @node and initializing it to zero.
+ * Once this call returns, you must use the node_ref() and node_unref()
+ * functions to manage this node.
+ */
+void kdbus_node_init(struct kdbus_node *node, unsigned int type)
+{
+ atomic_set(&node->refcnt, 1);
+ mutex_init(&node->lock);
+ node->id = 0;
+ node->type = type;
+ RB_CLEAR_NODE(&node->rb);
+ node->children = RB_ROOT;
+ init_waitqueue_head(&node->waitq);
+ atomic_set(&node->active, KDBUS_NODE_NEW);
+}
+
+/**
+ * kdbus_node_link() - link a node into the nodes system
+ * @node: Pointer to the node to initialize
+ * @parent: Pointer to a parent node, may be %NULL
+ * @name: The name of the node (or NULL if root node)
+ *
+ * This links a node into the hierarchy. This must not be called multiple times.
+ * If @parent is NULL, the node becomes a new root node.
+ *
+ * This call will fail if @name is not unique across all its siblings or if no
+ * ID could be allocated. You must not activate a node if linking failed! It is
+ * safe to deactivate it, though.
+ *
+ * Once you linked a node, you must call kdbus_node_deactivate() before you drop
+ * the last reference (even if you never activate the node).
+ *
+ * Return: 0 on success, negative error otherwise.
+ */
+int kdbus_node_link(struct kdbus_node *node, struct kdbus_node *parent,
+ const char *name)
+{
+ int ret;
+
+ if (WARN_ON(node->type != KDBUS_NODE_DOMAIN && !parent))
+ return -EINVAL;
+
+ if (WARN_ON(parent && !name))
+ return -EINVAL;
+
+ if (name) {
+ node->name = kstrdup(name, GFP_KERNEL);
+ if (!node->name)
+ return -ENOMEM;
+
+ node->hash = kdbus_node_name_hash(name);
+ }
+
+ ret = ida_simple_get(&kdbus_node_ida, 1, 0, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+
+ node->id = ret;
+ ret = 0;
+
+ if (parent) {
+ struct rb_node **n, *prev;
+
+ if (!kdbus_node_acquire(parent))
+ return -ESHUTDOWN;
+
+ mutex_lock(&parent->lock);
+
+ n = &parent->children.rb_node;
+ prev = NULL;
+
+ while (*n) {
+ struct kdbus_node *pos;
+ int result;
+
+ pos = kdbus_node_from_rb(*n);
+ prev = *n;
+ result = kdbus_node_name_compare(node->hash,
+ node->name,
+ pos);
+ if (result == 0) {
+ ret = -EEXIST;
+ goto exit_unlock;
+ }
+
+ if (result < 0)
+ n = &pos->rb.rb_left;
+ else
+ n = &pos->rb.rb_right;
+ }
+
+ /* add new node and rebalance the tree */
+ rb_link_node(&node->rb, prev, n);
+ rb_insert_color(&node->rb, &parent->children);
+ node->parent = kdbus_node_ref(parent);
+
+exit_unlock:
+ mutex_unlock(&parent->lock);
+ kdbus_node_release(parent);
+ }
+
+ return ret;
+}
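+
+/*
+ * Typical owner-side lifecycle sketch ("owner", its embedded node "n" and the
+ * name "custom.ep" are hypothetical; error handling trimmed):
+ *
+ *	kdbus_node_init(&owner->n, KDBUS_NODE_ENDPOINT);
+ *	ret = kdbus_node_link(&owner->n, parent, "custom.ep");
+ *	if (ret == 0)
+ *		kdbus_node_activate(&owner->n);
+ *	...
+ *	kdbus_node_deactivate(&owner->n);
+ *	kdbus_node_unref(&owner->n);
+ */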
+
+/**
+ * kdbus_node_ref() - Acquire object reference
+ * @node: node to acquire reference to (or NULL)
+ *
+ * This acquires a new reference to @node. You must already own a reference when
+ * calling this!
+ * If @node is NULL, this is a no-op.
+ *
+ * Return: @node is returned
+ */
+struct kdbus_node *kdbus_node_ref(struct kdbus_node *node)
+{
+ if (node)
+ atomic_inc(&node->refcnt);
+ return node;
+}
+
+/**
+ * kdbus_node_unref() - Drop object reference
+ * @node: node to drop reference to (or NULL)
+ *
+ * This drops an object reference to @node. You must not access the node if you
+ * no longer own a reference.
+ * If the ref-count drops to 0, the object will be destroyed (->free_cb will be
+ * called).
+ *
+ * If you linked or activated the node, you must deactivate the node before you
+ * drop your last reference! If you didn't link or activate the node, you can
+ * drop any reference you want.
+ *
+ * Note that this calls into ->free_cb() and thus _might_ sleep. The ->free_cb()
+ * callbacks must not acquire any outer locks, though. So you can safely drop
+ * references while holding locks.
+ *
+ * If @node is NULL, this is a no-op.
+ *
+ * Return: This always returns NULL
+ */
+struct kdbus_node *kdbus_node_unref(struct kdbus_node *node)
+{
+ if (node && atomic_dec_and_test(&node->refcnt)) {
+ struct kdbus_node safe = *node;
+
+ WARN_ON(atomic_read(&node->active) != KDBUS_NODE_DRAINED);
+ WARN_ON(!RB_EMPTY_NODE(&node->rb));
+
+ if (node->free_cb)
+ node->free_cb(node);
+ if (safe.id > 0)
+ ida_simple_remove(&kdbus_node_ida, safe.id);
+
+ kfree(safe.name);
+
+ /*
+ * kdbusfs relies on the parent to be available even after the
+ * node was deactivated and unlinked. Therefore, we pin it
+ * until a node is destroyed.
+ */
+ kdbus_node_unref(safe.parent);
+ }
+
+ return NULL;
+}
+
+/**
+ * kdbus_node_is_active() - test whether a node is active
+ * @node: node to test
+ *
+ * This checks whether @node is active. That means, @node was linked and
+ * activated by the node owner and hasn't been deactivated, yet. If, and only
+ * if, a node is active, kdbus_node_acquire() will be able to acquire active
+ * references.
+ *
+ * Note that this function does not give any lifetime guarantees. After this
+ * call returns, the node might be deactivated immediately. Normally, what you
+ * want is to acquire a real active reference via kdbus_node_acquire().
+ *
+ * Return: true if @node is active, false otherwise
+ */
+bool kdbus_node_is_active(struct kdbus_node *node)
+{
+ return atomic_read(&node->active) >= 0;
+}
+
+/**
+ * kdbus_node_is_deactivated() - test whether a node was already deactivated
+ * @node: node to test
+ *
+ * This checks whether kdbus_node_deactivate() was called on @node. Note that
+ * this might be true even if you never deactivated the node directly, but only
+ * one of its ancestors.
+ *
+ * Note that even if this returns 'false', the node might get deactivated
+ * immediately after the call returns.
+ *
+ * Return: true if @node was already deactivated, false if not
+ */
+bool kdbus_node_is_deactivated(struct kdbus_node *node)
+{
+ int v;
+
+ v = atomic_read(&node->active);
+ return v != KDBUS_NODE_NEW && v < 0;
+}
+
+/**
+ * kdbus_node_activate() - activate a node
+ * @node: node to activate
+ *
+ * This marks @node as active if, and only if, the node wasn't activated nor
+ * deactivated, yet, and the parent is still active. Any but the first call to
+ * kdbus_node_activate() is a no-op.
+ * If you called kdbus_node_deactivate() before, then even the first call to
+ * kdbus_node_activate() will be a no-op.
+ *
+ * This call doesn't give any lifetime guarantees. The node might get
+ * deactivated immediately after this call returns. Or the parent might already
+ * be deactivated, which will make this call a no-op.
+ *
+ * If this call successfully activated a node, it will take an object reference
+ * to it. This reference is dropped after the node is deactivated. Therefore,
+ * the object owner can safely drop their reference to @node iff they know that
+ * its parent node will get deactivated at some point. Once the parent node is
+ * deactivated, it will deactivate all its children and thus drop this reference
+ * again.
+ *
+ * Return: True if this call successfully activated the node, otherwise false.
+ * Note that this might return false, even if the node is still active
+ * (eg., if you called this a second time).
+ */
+bool kdbus_node_activate(struct kdbus_node *node)
+{
+ bool res = false;
+
+ mutex_lock(&node->lock);
+ if (atomic_read(&node->active) == KDBUS_NODE_NEW) {
+ atomic_sub(KDBUS_NODE_NEW, &node->active);
+ /* activated nodes have ref +1 */
+ kdbus_node_ref(node);
+ res = true;
+ }
+ mutex_unlock(&node->lock);
+
+ return res;
+}
+
+/**
+ * kdbus_node_deactivate() - deactivate a node
+ * @node: The node to deactivate.
+ *
+ * This function recursively deactivates this node and all its children. It
+ * returns only once all children and the node itself were recursively disabled
+ * (even if you call this function multiple times in parallel).
+ *
+ * It is safe to call this function on _any_ node that was initialized _any_
+ * number of times.
+ *
+ * This call may sleep, as it waits for all active references to be dropped.
+ */
+void kdbus_node_deactivate(struct kdbus_node *node)
+{
+ struct kdbus_node *pos, *child;
+ struct rb_node *rb;
+ int v_pre, v_post;
+
+ pos = node;
+
+ /*
+ * To avoid recursion, we perform back-tracking while deactivating
+ * nodes. For each node we enter, we first mark the active-counter as
+ * deactivated by adding BIAS. If the node has children, we set the first
+ * child as current position and start over. If the node has no
+ * children, we drain the node by waiting for all active refs to be
+ * dropped and then releasing the node.
+ *
+ * After the node is released, we set its parent as current position
+ * and start over. If the current position was the initial node, we're
+ * done.
+ *
+ * Note that this function can be called in parallel by multiple
+ * callers. We make sure that each node is only released once, and any
+ * racing caller will wait until the other thread fully released that
+ * node.
+ */
+
+ for (;;) {
+ /*
+ * Add BIAS to node->active to mark it as inactive. If it was
+ * never active before, immediately mark it as RELEASE_DIRECT
+ * so we remember this state.
+ * We cannot remember v_pre as we might iterate into the
+ * children, overwriting v_pre, before we can release our node.
+ */
+ mutex_lock(&pos->lock);
+ v_pre = atomic_read(&pos->active);
+ if (v_pre >= 0)
+ atomic_add_return(KDBUS_NODE_BIAS, &pos->active);
+ else if (v_pre == KDBUS_NODE_NEW)
+ atomic_set(&pos->active, KDBUS_NODE_RELEASE_DIRECT);
+ mutex_unlock(&pos->lock);
+
+ /* wait until all active references were dropped */
+ wait_event(pos->waitq,
+ atomic_read(&pos->active) <= KDBUS_NODE_BIAS);
+
+ mutex_lock(&pos->lock);
+ /* recurse into first child if any */
+ rb = rb_first(&pos->children);
+ if (rb) {
+ child = kdbus_node_ref(kdbus_node_from_rb(rb));
+ mutex_unlock(&pos->lock);
+ pos = child;
+ continue;
+ }
+
+ /* mark object as RELEASE */
+ v_post = atomic_read(&pos->active);
+ if (v_post == KDBUS_NODE_BIAS ||
+ v_post == KDBUS_NODE_RELEASE_DIRECT)
+ atomic_set(&pos->active, KDBUS_NODE_RELEASE);
+ mutex_unlock(&pos->lock);
+
+ /*
+ * If this is the thread that marked the object as RELEASE, we
+ * perform the actual release. Otherwise, we wait until the
+ * release is done and the node is marked as DRAINED.
+ */
+ if (v_post == KDBUS_NODE_BIAS ||
+ v_post == KDBUS_NODE_RELEASE_DIRECT) {
+ if (pos->release_cb)
+ pos->release_cb(pos, v_post == KDBUS_NODE_BIAS);
+
+ if (pos->parent) {
+ mutex_lock(&pos->parent->lock);
+ if (!RB_EMPTY_NODE(&pos->rb)) {
+ rb_erase(&pos->rb,
+ &pos->parent->children);
+ RB_CLEAR_NODE(&pos->rb);
+ }
+ mutex_unlock(&pos->parent->lock);
+ }
+
+ /* mark as DRAINED */
+ atomic_set(&pos->active, KDBUS_NODE_DRAINED);
+ wake_up_all(&pos->waitq);
+
+ /* drop VFS cache */
+ kdbus_fs_flush(pos);
+
+ /*
+ * If the node was activated and someone subtracted BIAS
+ * from it to deactivate it, we, and only us, are
+ * responsible to release the extra ref-count that was
+ * taken once in kdbus_node_activate().
+ * If the node was never activated, no-one ever
+ * subtracted BIAS, but instead skipped that state and
+ * immediately went to NODE_RELEASE_DIRECT. In that case
+ * we must not drop the reference.
+ */
+ if (v_post == KDBUS_NODE_BIAS)
+ kdbus_node_unref(pos);
+ } else {
+ /* wait until object is DRAINED */
+ wait_event(pos->waitq,
+ atomic_read(&pos->active) == KDBUS_NODE_DRAINED);
+ }
+
+ /*
+ * We're done with the current node. Continue on its parent
+ * again, which will try deactivating its next child, or itself
+ * if no child is left.
+ * If we've reached our initial node again, we are done and
+ * can safely return.
+ */
+ if (pos == node)
+ break;
+
+ child = pos;
+ pos = pos->parent;
+ kdbus_node_unref(child);
+ }
+}
+
+/**
+ * kdbus_node_acquire() - Acquire an active ref on a node
+ * @node: The node
+ *
+ * This acquires an active-reference to @node. This will only succeed if the
+ * node is active. You must release this active reference via
+ * kdbus_node_release() again.
+ *
+ * See the introduction to "active references" for more details.
+ *
+ * Return: %true if @node was non-NULL and active
+ */
+bool kdbus_node_acquire(struct kdbus_node *node)
+{
+ return node && atomic_inc_unless_negative(&node->active);
+}
+
+/**
+ * kdbus_node_release() - Release an active ref on a node
+ * @node: The node
+ *
+ * This releases an active reference that was previously acquired via
+ * kdbus_node_acquire(). See kdbus_node_acquire() for details.
+ */
+void kdbus_node_release(struct kdbus_node *node)
+{
+ if (node && atomic_dec_return(&node->active) == KDBUS_NODE_BIAS)
+ wake_up(&node->waitq);
+}
+
+/**
+ * kdbus_node_find_child() - Find child by name
+ * @node: parent node to search through
+ * @name: name of child node
+ *
+ * This searches through all children of @node for a child-node with name @name.
+ * If not found, or if the child is deactivated, NULL is returned. Otherwise,
+ * the child is acquired and a new reference is returned.
+ *
+ * If you're done with the child, you need to release it and drop your
+ * reference.
+ *
+ * This function does not acquire the parent node. However, if the parent was
+ * already deactivated, then kdbus_node_deactivate() will, at some point, also
+ * deactivate the child. Therefore, we can rely on the explicit ordering during
+ * deactivation.
+ *
+ * Return: Reference to acquired child node, or NULL if not found / not active.
+ */
+struct kdbus_node *kdbus_node_find_child(struct kdbus_node *node,
+ const char *name)
+{
+ struct kdbus_node *child;
+ struct rb_node *rb;
+ unsigned int hash;
+ int ret;
+
+ hash = kdbus_node_name_hash(name);
+
+ mutex_lock(&node->lock);
+ rb = node->children.rb_node;
+ while (rb) {
+ child = kdbus_node_from_rb(rb);
+ ret = kdbus_node_name_compare(hash, name, child);
+ if (ret < 0)
+ rb = rb->rb_left;
+ else if (ret > 0)
+ rb = rb->rb_right;
+ else
+ break;
+ }
+ if (rb && kdbus_node_acquire(child))
+ kdbus_node_ref(child);
+ else
+ child = NULL;
+ mutex_unlock(&node->lock);
+
+ return child;
+}
+
+static struct kdbus_node *node_find_closest_unlocked(struct kdbus_node *node,
+ unsigned int hash,
+ const char *name)
+{
+ struct kdbus_node *n, *pos = NULL;
+ struct rb_node *rb;
+ int res;
+
+ /*
+ * Find the closest child with ``node->hash >= hash'', or, if @name is
+ * valid, ``node->name >= name'' (where '>=' is the lex. order).
+ */
+
+ rb = node->children.rb_node;
+ while (rb) {
+ n = kdbus_node_from_rb(rb);
+
+ if (name)
+ res = kdbus_node_name_compare(hash, name, n);
+ else
+ res = hash - n->hash;
+
+ if (res <= 0) {
+ rb = rb->rb_left;
+ pos = n;
+ } else { /* ``hash > n->hash'', ``name > n->name'' */
+ rb = rb->rb_right;
+ }
+ }
+
+ return pos;
+}
+
+/**
+ * kdbus_node_find_closest() - Find closest child-match
+ * @node: parent node to search through
+ * @hash: hash value to find closest match for
+ *
+ * Find the closest child of @node with a hash greater than or equal to @hash.
+ * The closest match is the left-most child of @node with this property; that
+ * is, it is the first child with that hash returned by
+ * kdbus_node_next_child(), if you'd iterate the whole parent node.
+ *
+ * Return: Reference to acquired child, or NULL if none found.
+ */
+struct kdbus_node *kdbus_node_find_closest(struct kdbus_node *node,
+ unsigned int hash)
+{
+ struct kdbus_node *child;
+ struct rb_node *rb;
+
+ mutex_lock(&node->lock);
+
+ child = node_find_closest_unlocked(node, hash, NULL);
+ while (child && !kdbus_node_acquire(child)) {
+ rb = rb_next(&child->rb);
+ if (rb)
+ child = kdbus_node_from_rb(rb);
+ else
+ child = NULL;
+ }
+ kdbus_node_ref(child);
+
+ mutex_unlock(&node->lock);
+
+ return child;
+}
+
+/**
+ * kdbus_node_next_child() - Acquire next child
+ * @node: parent node
+ * @prev: previous child-node position or NULL
+ *
+ * This function returns a reference to the next active child of @node, after
+ * the passed position @prev. If @prev is NULL, a reference to the first active
+ * child is returned. If no more active children are found, NULL is returned.
+ *
+ * This function acquires the next child it returns. If you're done with the
+ * returned pointer, you need to release _and_ unref it.
+ *
+ * The passed in pointer @prev is not modified by this function, and it does
+ * *not* have to be active. If @prev was acquired via different means, or if it
+ * was unlinked from its parent before you pass it in, then this iterator will
+ * still return the next active child (it will have to search through the
+ * rb-tree based on the node-name, though).
+ * However, @prev must not be linked to a different parent than @node!
+ *
+ * Return: Reference to next acquired child, or NULL if at the end.
+ */
+struct kdbus_node *kdbus_node_next_child(struct kdbus_node *node,
+ struct kdbus_node *prev)
+{
+ struct kdbus_node *pos = NULL;
+ struct rb_node *rb;
+
+ mutex_lock(&node->lock);
+
+ if (!prev) {
+ /*
+ * New iteration; find first node in rb-tree and try to acquire
+ * it. If we got it, directly return it as first element.
+ * Otherwise, the loop below will find the next active node.
+ */
+ rb = rb_first(&node->children);
+ if (!rb)
+ goto exit;
+ pos = kdbus_node_from_rb(rb);
+ if (kdbus_node_acquire(pos))
+ goto exit;
+ } else if (RB_EMPTY_NODE(&prev->rb)) {
+ /*
+ * The current iterator is no longer linked to the rb-tree. Use
+ * its hash value and name to find the next _higher_ node and
+ * acquire it. If we got it, return it as next element.
+ * Otherwise, the loop below will find the next active node.
+ */
+ pos = node_find_closest_unlocked(node, prev->hash, prev->name);
+ if (!pos)
+ goto exit;
+ if (kdbus_node_acquire(pos))
+ goto exit;
+ } else {
+ /*
+ * The current iterator is still linked to the parent. Set it
+ * as current position and use the loop below to find the next
+ * active element.
+ */
+ pos = prev;
+ }
+
+ /* @pos was already returned or is inactive; find next active node */
+ do {
+ rb = rb_next(&pos->rb);
+ if (rb)
+ pos = kdbus_node_from_rb(rb);
+ else
+ pos = NULL;
+ } while (pos && !kdbus_node_acquire(pos));
+
+exit:
+ /* @pos is NULL or acquired. Take ref if non-NULL and return it */
+ kdbus_node_ref(pos);
+ mutex_unlock(&node->lock);
+ return pos;
+}
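+
+/*
+ * Iteration sketch over all active children of a (hypothetical) "parent"
+ * node, releasing and unreffing each returned child as described above:
+ *
+ *	struct kdbus_node *pos = NULL, *prev;
+ *
+ *	for (;;) {
+ *		prev = pos;
+ *		pos = kdbus_node_next_child(parent, prev);
+ *		if (prev) {
+ *			kdbus_node_release(prev);
+ *			kdbus_node_unref(prev);
+ *		}
+ *		if (!pos)
+ *			break;
+ *		... use @pos, it is acquired and referenced ...
+ *	}
+ */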
diff --git a/ipc/kdbus/node.h b/ipc/kdbus/node.h
new file mode 100644
index 000000000..970e02b08
--- /dev/null
+++ b/ipc/kdbus/node.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_NODE_H
+#define __KDBUS_NODE_H
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+
+struct kdbus_node;
+
+enum kdbus_node_type {
+ KDBUS_NODE_DOMAIN,
+ KDBUS_NODE_CONTROL,
+ KDBUS_NODE_BUS,
+ KDBUS_NODE_ENDPOINT,
+};
+
+typedef void (*kdbus_node_free_t) (struct kdbus_node *node);
+typedef void (*kdbus_node_release_t) (struct kdbus_node *node, bool was_active);
+
+struct kdbus_node {
+ atomic_t refcnt;
+ atomic_t active;
+ wait_queue_head_t waitq;
+
+ /* static members */
+ unsigned int type;
+ kdbus_node_free_t free_cb;
+ kdbus_node_release_t release_cb;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
+
+ /* valid once linked */
+ char *name;
+ unsigned int hash;
+ unsigned int id;
+ struct kdbus_node *parent; /* may be NULL */
+
+ /* valid iff active */
+ struct mutex lock;
+ struct rb_node rb;
+ struct rb_root children;
+};
+
+#define kdbus_node_from_rb(_node) rb_entry((_node), struct kdbus_node, rb)
+
+extern struct ida kdbus_node_ida;
+
+void kdbus_node_init(struct kdbus_node *node, unsigned int type);
+
+int kdbus_node_link(struct kdbus_node *node, struct kdbus_node *parent,
+ const char *name);
+
+struct kdbus_node *kdbus_node_ref(struct kdbus_node *node);
+struct kdbus_node *kdbus_node_unref(struct kdbus_node *node);
+
+bool kdbus_node_is_active(struct kdbus_node *node);
+bool kdbus_node_is_deactivated(struct kdbus_node *node);
+bool kdbus_node_activate(struct kdbus_node *node);
+void kdbus_node_deactivate(struct kdbus_node *node);
+
+bool kdbus_node_acquire(struct kdbus_node *node);
+void kdbus_node_release(struct kdbus_node *node);
+
+struct kdbus_node *kdbus_node_find_child(struct kdbus_node *node,
+ const char *name);
+struct kdbus_node *kdbus_node_find_closest(struct kdbus_node *node,
+ unsigned int hash);
+struct kdbus_node *kdbus_node_next_child(struct kdbus_node *node,
+ struct kdbus_node *prev);
+
+#endif
diff --git a/ipc/kdbus/notify.c b/ipc/kdbus/notify.c
new file mode 100644
index 000000000..375758c48
--- /dev/null
+++ b/ipc/kdbus/notify.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "item.h"
+#include "message.h"
+#include "notify.h"
+
+static inline void kdbus_notify_add_tail(struct kdbus_staging *staging,
+ struct kdbus_bus *bus)
+{
+ spin_lock(&bus->notify_lock);
+ list_add_tail(&staging->notify_entry, &bus->notify_list);
+ spin_unlock(&bus->notify_lock);
+}
+
+static int kdbus_notify_reply(struct kdbus_bus *bus, u64 id,
+ u64 cookie, u64 msg_type)
+{
+ struct kdbus_staging *s;
+
+ s = kdbus_staging_new_kernel(bus, id, cookie, 0, msg_type);
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ kdbus_notify_add_tail(s, bus);
+ return 0;
+}
+
+/**
+ * kdbus_notify_reply_timeout() - queue a timeout reply
+ * @bus: Bus which queues the messages
+ * @id: The destination's connection ID
+ * @cookie: The cookie to set in the reply.
+ *
+ * Queues a message that has a KDBUS_ITEM_REPLY_TIMEOUT item attached.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kdbus_notify_reply_timeout(struct kdbus_bus *bus, u64 id, u64 cookie)
+{
+ return kdbus_notify_reply(bus, id, cookie, KDBUS_ITEM_REPLY_TIMEOUT);
+}
+
+/**
+ * kdbus_notify_reply_dead() - queue a 'dead' reply
+ * @bus: Bus which queues the messages
+ * @id: The destination's connection ID
+ * @cookie: The cookie to set in the reply.
+ *
+ * Queues a message that has a KDBUS_ITEM_REPLY_DEAD item attached.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kdbus_notify_reply_dead(struct kdbus_bus *bus, u64 id, u64 cookie)
+{
+ return kdbus_notify_reply(bus, id, cookie, KDBUS_ITEM_REPLY_DEAD);
+}
+
+/**
+ * kdbus_notify_name_change() - queue a notification about a name owner change
+ * @bus: Bus which queues the messages
+ * @type: The type of the notification; KDBUS_ITEM_NAME_ADD,
+ * KDBUS_ITEM_NAME_CHANGE or KDBUS_ITEM_NAME_REMOVE
+ * @old_id: The id of the connection that used to own the name
+ * @new_id: The id of the new owner connection
+ * @old_flags: The flags to pass in the KDBUS_ITEM flags field for
+ * the old owner
+ * @new_flags: The flags to pass in the KDBUS_ITEM flags field for
+ * the new owner
+ * @name: The name that was removed or assigned to a new owner
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kdbus_notify_name_change(struct kdbus_bus *bus, u64 type,
+ u64 old_id, u64 new_id,
+ u64 old_flags, u64 new_flags,
+ const char *name)
+{
+ size_t name_len, extra_size;
+ struct kdbus_staging *s;
+
+ name_len = strlen(name) + 1;
+ extra_size = sizeof(struct kdbus_notify_name_change) + name_len;
+
+ s = kdbus_staging_new_kernel(bus, KDBUS_DST_ID_BROADCAST, 0,
+ extra_size, type);
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ s->notify->name_change.old_id.id = old_id;
+ s->notify->name_change.old_id.flags = old_flags;
+ s->notify->name_change.new_id.id = new_id;
+ s->notify->name_change.new_id.flags = new_flags;
+ memcpy(s->notify->name_change.name, name, name_len);
+
+ kdbus_notify_add_tail(s, bus);
+ return 0;
+}
+
+/**
+ * kdbus_notify_id_change() - queue a notification about a unique ID change
+ * @bus: Bus which queues the messages
+ * @type: The type of the notification; KDBUS_ITEM_ID_ADD or
+ * KDBUS_ITEM_ID_REMOVE
+ * @id: The id of the connection that was added or removed
+ * @flags: The flags to pass in the KDBUS_ITEM flags field
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kdbus_notify_id_change(struct kdbus_bus *bus, u64 type, u64 id, u64 flags)
+{
+ struct kdbus_staging *s;
+ size_t extra_size;
+
+ extra_size = sizeof(struct kdbus_notify_id_change);
+ s = kdbus_staging_new_kernel(bus, KDBUS_DST_ID_BROADCAST, 0,
+ extra_size, type);
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ s->notify->id_change.id = id;
+ s->notify->id_change.flags = flags;
+
+ kdbus_notify_add_tail(s, bus);
+ return 0;
+}
+
+/**
+ * kdbus_notify_flush() - send a list of collected messages
+ * @bus: Bus which queues the messages
+ *
+ * The list is empty after sending the messages.
+ */
+void kdbus_notify_flush(struct kdbus_bus *bus)
+{
+ LIST_HEAD(notify_list);
+ struct kdbus_staging *s, *tmp;
+
+ mutex_lock(&bus->notify_flush_lock);
+ down_read(&bus->name_registry->rwlock);
+
+ spin_lock(&bus->notify_lock);
+ list_splice_init(&bus->notify_list, &notify_list);
+ spin_unlock(&bus->notify_lock);
+
+ list_for_each_entry_safe(s, tmp, &notify_list, notify_entry) {
+ if (s->msg->dst_id != KDBUS_DST_ID_BROADCAST) {
+ struct kdbus_conn *conn;
+
+ conn = kdbus_bus_find_conn_by_id(bus, s->msg->dst_id);
+ if (conn) {
+ kdbus_bus_eavesdrop(bus, NULL, s);
+ kdbus_conn_entry_insert(NULL, conn, s, NULL,
+ NULL);
+ kdbus_conn_unref(conn);
+ }
+ } else {
+ kdbus_bus_broadcast(bus, NULL, s);
+ }
+
+ list_del(&s->notify_entry);
+ kdbus_staging_free(s);
+ }
+
+ up_read(&bus->name_registry->rwlock);
+ mutex_unlock(&bus->notify_flush_lock);
+}
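+
+/*
+ * Usage pattern (as in kdbus_name_acquire() and kdbus_name_release()):
+ * notifications are queued via the kdbus_notify_*() helpers while the name
+ * registry or connection locks are held, and kdbus_notify_flush() is called
+ * only after those locks have been dropped, since flushing itself takes the
+ * bus-wide notify_flush_lock and the name registry lock.
+ */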
+
+/**
+ * kdbus_notify_free() - free a list of collected messages
+ * @bus: Bus which queues the messages
+ */
+void kdbus_notify_free(struct kdbus_bus *bus)
+{
+ struct kdbus_staging *s, *tmp;
+
+ list_for_each_entry_safe(s, tmp, &bus->notify_list, notify_entry) {
+ list_del(&s->notify_entry);
+ kdbus_staging_free(s);
+ }
+}
diff --git a/ipc/kdbus/notify.h b/ipc/kdbus/notify.h
new file mode 100644
index 000000000..03df464cb
--- /dev/null
+++ b/ipc/kdbus/notify.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_NOTIFY_H
+#define __KDBUS_NOTIFY_H
+
+struct kdbus_bus;
+
+int kdbus_notify_id_change(struct kdbus_bus *bus, u64 type, u64 id, u64 flags);
+int kdbus_notify_reply_timeout(struct kdbus_bus *bus, u64 id, u64 cookie);
+int kdbus_notify_reply_dead(struct kdbus_bus *bus, u64 id, u64 cookie);
+int kdbus_notify_name_change(struct kdbus_bus *bus, u64 type,
+ u64 old_id, u64 new_id,
+ u64 old_flags, u64 new_flags,
+ const char *name);
+void kdbus_notify_flush(struct kdbus_bus *bus);
+void kdbus_notify_free(struct kdbus_bus *bus);
+
+#endif
diff --git a/ipc/kdbus/policy.c b/ipc/kdbus/policy.c
new file mode 100644
index 000000000..f2618e15e
--- /dev/null
+++ b/ipc/kdbus/policy.c
@@ -0,0 +1,489 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "domain.h"
+#include "item.h"
+#include "names.h"
+#include "policy.h"
+
+#define KDBUS_POLICY_HASH_SIZE 64
+
+/**
+ * struct kdbus_policy_db_entry_access - a database entry access item
+ * @type: One of KDBUS_POLICY_ACCESS_* types
+ * @access: Access to grant. One of KDBUS_POLICY_*
+ * @uid: For KDBUS_POLICY_ACCESS_USER, the global uid
+ * @gid: For KDBUS_POLICY_ACCESS_GROUP, the global gid
+ * @list: List entry item for the entry's list
+ *
+ * This is the internal version of struct kdbus_policy_db_access.
+ */
+struct kdbus_policy_db_entry_access {
+ u8 type; /* USER, GROUP, WORLD */
+ u8 access; /* OWN, TALK, SEE */
+ union {
+ kuid_t uid; /* global uid */
+ kgid_t gid; /* global gid */
+ };
+ struct list_head list;
+};
+
+/**
+ * struct kdbus_policy_db_entry - a policy database entry
+ * @name: The name to match the policy entry against
+ * @hentry: The hash entry for the database's entries_hash
+ * @access_list: List head for keeping tracks of the entry's
+ * access items.
+ * @owner: The owner of this entry. Can be a kdbus_conn or
+ * a kdbus_ep object.
+ * @wildcard: The name is a wildcard, i.e. it ends in '.*'
+ */
+struct kdbus_policy_db_entry {
+ char *name;
+ struct hlist_node hentry;
+ struct list_head access_list;
+ const void *owner;
+ bool wildcard:1;
+};
+
+static void kdbus_policy_entry_free(struct kdbus_policy_db_entry *e)
+{
+ struct kdbus_policy_db_entry_access *a, *tmp;
+
+ list_for_each_entry_safe(a, tmp, &e->access_list, list) {
+ list_del(&a->list);
+ kfree(a);
+ }
+
+ kfree(e->name);
+ kfree(e);
+}
+
+static unsigned int kdbus_strnhash(const char *str, size_t len)
+{
+ unsigned long hash = init_name_hash();
+
+ while (len--)
+ hash = partial_name_hash(*str++, hash);
+
+ return end_name_hash(hash);
+}
+
+static const struct kdbus_policy_db_entry *
+kdbus_policy_lookup(struct kdbus_policy_db *db, const char *name, u32 hash)
+{
+ struct kdbus_policy_db_entry *e;
+ const char *dot;
+ size_t len;
+
+ /* find exact match */
+ hash_for_each_possible(db->entries_hash, e, hentry, hash)
+ if (strcmp(e->name, name) == 0 && !e->wildcard)
+ return e;
+
+ /* find wildcard match */
+
+ dot = strrchr(name, '.');
+ if (!dot)
+ return NULL;
+
+ len = dot - name;
+ hash = kdbus_strnhash(name, len);
+
+ hash_for_each_possible(db->entries_hash, e, hentry, hash)
+ if (e->wildcard && !strncmp(e->name, name, len) &&
+ !e->name[len])
+ return e;
+
+ return NULL;
+}
+
+/**
+ * kdbus_policy_db_clear() - release all memory from a policy db
+ * @db: The policy database
+ */
+void kdbus_policy_db_clear(struct kdbus_policy_db *db)
+{
+ struct kdbus_policy_db_entry *e;
+ struct hlist_node *tmp;
+ unsigned int i;
+
+ /* purge entries */
+ down_write(&db->entries_rwlock);
+ hash_for_each_safe(db->entries_hash, i, tmp, e, hentry) {
+ hash_del(&e->hentry);
+ kdbus_policy_entry_free(e);
+ }
+ up_write(&db->entries_rwlock);
+}
+
+/**
+ * kdbus_policy_db_init() - initialize a new policy database
+ * @db: The location of the database
+ *
+ * This initializes a new policy-db. The underlying memory must have been
+ * cleared to zero by the caller.
+ */
+void kdbus_policy_db_init(struct kdbus_policy_db *db)
+{
+ hash_init(db->entries_hash);
+ init_rwsem(&db->entries_rwlock);
+}
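
A minimal lifecycle sketch, under the stated assumption that the database memory is zeroed before init; the stack-allocated db is illustrative only, in kdbus the database is normally embedded in another object:

	struct kdbus_policy_db db;

	memset(&db, 0, sizeof(db));	/* caller must zero the memory first */
	kdbus_policy_db_init(&db);

	/* ... set and query policy entries ... */

	kdbus_policy_db_clear(&db);
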
+
+/**
+ * kdbus_policy_query_unlocked() - Query the policy database
+ * @db: Policy database
+ * @cred: Credentials to test against
+ * @name: Name to query
+ * @hash: Hash value of @name
+ *
+ * Same as kdbus_policy_query() but requires the caller to lock the policy
+ * database against concurrent writes.
+ *
+ * Return: The highest KDBUS_POLICY_* access type found, or -EPERM if none.
+ */
+int kdbus_policy_query_unlocked(struct kdbus_policy_db *db,
+ const struct cred *cred, const char *name,
+ unsigned int hash)
+{
+ struct kdbus_policy_db_entry_access *a;
+ const struct kdbus_policy_db_entry *e;
+ int i, highest = -EPERM;
+
+ e = kdbus_policy_lookup(db, name, hash);
+ if (!e)
+ return -EPERM;
+
+ list_for_each_entry(a, &e->access_list, list) {
+ if ((int)a->access <= highest)
+ continue;
+
+ switch (a->type) {
+ case KDBUS_POLICY_ACCESS_USER:
+ if (uid_eq(cred->euid, a->uid))
+ highest = a->access;
+ break;
+ case KDBUS_POLICY_ACCESS_GROUP:
+ if (gid_eq(cred->egid, a->gid)) {
+ highest = a->access;
+ break;
+ }
+
+ for (i = 0; i < cred->group_info->ngroups; i++) {
+ kgid_t gid = GROUP_AT(cred->group_info, i);
+
+ if (gid_eq(gid, a->gid)) {
+ highest = a->access;
+ break;
+ }
+ }
+
+ break;
+ case KDBUS_POLICY_ACCESS_WORLD:
+ highest = a->access;
+ break;
+ }
+
+ /* OWN is the highest possible policy */
+ if (highest >= KDBUS_POLICY_OWN)
+ break;
+ }
+
+ return highest;
+}
+
+/**
+ * kdbus_policy_query() - Query the policy database
+ * @db: Policy database
+ * @cred: Credentials to test against
+ * @name: Name to query
+ * @hash: Hash value of @name
+ *
+ * Query the policy database @db for the access rights of @cred to the name
+ * @name. The access rights of @cred are returned, or -EPERM if no access is
+ * granted.
+ *
+ * This call effectively searches for the highest access-right granted to
+ * @cred. Callers should cache the result, as policy lookups are rather
+ * expensive.
+ *
+ * Return: The highest KDBUS_POLICY_* access type found, or -EPERM if none.
+ */
+int kdbus_policy_query(struct kdbus_policy_db *db, const struct cred *cred,
+ const char *name, unsigned int hash)
+{
+ int ret;
+
+ down_read(&db->entries_rwlock);
+ ret = kdbus_policy_query_unlocked(db, cred, name, hash);
+ up_read(&db->entries_rwlock);
+
+ return ret;
+}
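
A sketch of how a caller might check for TALK access before delivering a message; kdbus_strhash() is the string-hash helper from util.h also used later in this file, and db/name are assumed to come from the caller's context:

	unsigned int hash = kdbus_strhash(name);
	int access;

	access = kdbus_policy_query(db, current_cred(), name, hash);
	if (access < KDBUS_POLICY_TALK)
		return -EPERM;	/* neither TALK nor OWN was granted */
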
+
+static void __kdbus_policy_remove_owner(struct kdbus_policy_db *db,
+ const void *owner)
+{
+ struct kdbus_policy_db_entry *e;
+ struct hlist_node *tmp;
+ int i;
+
+ hash_for_each_safe(db->entries_hash, i, tmp, e, hentry)
+ if (e->owner == owner) {
+ hash_del(&e->hentry);
+ kdbus_policy_entry_free(e);
+ }
+}
+
+/**
+ * kdbus_policy_remove_owner() - remove all entries related to a connection
+ * @db: The policy database
+ * @owner: The connection whose items to remove
+ */
+void kdbus_policy_remove_owner(struct kdbus_policy_db *db,
+ const void *owner)
+{
+ down_write(&db->entries_rwlock);
+ __kdbus_policy_remove_owner(db, owner);
+ up_write(&db->entries_rwlock);
+}
+
+/*
+ * Convert user-provided policy access to internal kdbus policy
+ * access
+ */
+static struct kdbus_policy_db_entry_access *
+kdbus_policy_make_access(const struct kdbus_policy_access *uaccess)
+{
+ int ret;
+ struct kdbus_policy_db_entry_access *a;
+
+ a = kzalloc(sizeof(*a), GFP_KERNEL);
+ if (!a)
+ return ERR_PTR(-ENOMEM);
+
+ ret = -EINVAL;
+ switch (uaccess->access) {
+ case KDBUS_POLICY_SEE:
+ case KDBUS_POLICY_TALK:
+ case KDBUS_POLICY_OWN:
+ a->access = uaccess->access;
+ break;
+ default:
+ goto err;
+ }
+
+ switch (uaccess->type) {
+ case KDBUS_POLICY_ACCESS_USER:
+ a->uid = make_kuid(current_user_ns(), uaccess->id);
+ if (!uid_valid(a->uid))
+ goto err;
+
+ break;
+ case KDBUS_POLICY_ACCESS_GROUP:
+ a->gid = make_kgid(current_user_ns(), uaccess->id);
+ if (!gid_valid(a->gid))
+ goto err;
+
+ break;
+ case KDBUS_POLICY_ACCESS_WORLD:
+ break;
+ default:
+ goto err;
+ }
+
+ a->type = uaccess->type;
+
+ return a;
+
+err:
+ kfree(a);
+ return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_policy_set() - set a connection's policy rules
+ * @db: The policy database
+ * @items: A list of kdbus_item elements that contain both
+ * names and access rules to set.
+ * @items_size: The total size of the items.
+ * @max_policies: The maximum number of policy entries to allow.
+ * Pass 0 for no limit.
+ * @allow_wildcards: Boolean value whether wildcard entries (such as
+ * ones ending in '.*') should be allowed.
+ * @owner: The owner of the new policy items.
+ *
+ * This function sets a new set of policies for a given owner. The names and
+ * access rules are gathered by walking the list of items passed in as
+ * argument. An item of type KDBUS_ITEM_NAME is expected before any number of
+ * KDBUS_ITEM_POLICY_ACCESS items. If there are more repetitions of this
+ * pattern than allowed by @max_policies, -E2BIG is returned.
+ *
+ * In order to allow atomic replacement of rules, the function first removes
+ * all entries that have been created for the given owner previously.
+ *
+ * Callers of this function must make sure that the owner is either a custom
+ * endpoint, or, on the default endpoint, a connection that is a policy
+ * holder or an activator.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kdbus_policy_set(struct kdbus_policy_db *db,
+ const struct kdbus_item *items,
+ size_t items_size,
+ size_t max_policies,
+ bool allow_wildcards,
+ const void *owner)
+{
+ struct kdbus_policy_db_entry_access *a;
+ struct kdbus_policy_db_entry *e, *p;
+ const struct kdbus_item *item;
+ struct hlist_node *tmp;
+ HLIST_HEAD(entries);
+ HLIST_HEAD(restore);
+ size_t count = 0;
+ int i, ret = 0;
+ u32 hash;
+
+ /* Walk the list of items and look for new policies */
+ e = NULL;
+ KDBUS_ITEMS_FOREACH(item, items, items_size) {
+ switch (item->type) {
+ case KDBUS_ITEM_NAME: {
+ size_t len;
+
+ if (max_policies && ++count > max_policies) {
+ ret = -E2BIG;
+ goto exit;
+ }
+
+ if (!kdbus_name_is_valid(item->str, true)) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ e = kzalloc(sizeof(*e), GFP_KERNEL);
+ if (!e) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ INIT_LIST_HEAD(&e->access_list);
+ e->owner = owner;
+ hlist_add_head(&e->hentry, &entries);
+
+ e->name = kstrdup(item->str, GFP_KERNEL);
+ if (!e->name) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ /*
+ * If a supplied name ends with '.*', cut off that
+ * part, store only what comes before it, and mark the
+ * entry as a wildcard.
+ */
+ len = strlen(e->name);
+ if (len > 2 &&
+ e->name[len - 3] == '.' &&
+ e->name[len - 2] == '*') {
+ if (!allow_wildcards) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ e->name[len - 3] = '\0';
+ e->wildcard = true;
+ }
+
+ break;
+ }
+
+ case KDBUS_ITEM_POLICY_ACCESS:
+ if (!e) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ a = kdbus_policy_make_access(&item->policy_access);
+ if (IS_ERR(a)) {
+ ret = PTR_ERR(a);
+ goto exit;
+ }
+
+ list_add_tail(&a->list, &e->access_list);
+ break;
+ }
+ }
+
+ down_write(&db->entries_rwlock);
+
+ /* remember previous entries to restore in case of failure */
+ hash_for_each_safe(db->entries_hash, i, tmp, e, hentry)
+ if (e->owner == owner) {
+ hash_del(&e->hentry);
+ hlist_add_head(&e->hentry, &restore);
+ }
+
+ hlist_for_each_entry_safe(e, tmp, &entries, hentry) {
+ /* prevent duplicates */
+ hash = kdbus_strhash(e->name);
+ hash_for_each_possible(db->entries_hash, p, hentry, hash)
+ if (strcmp(e->name, p->name) == 0 &&
+ e->wildcard == p->wildcard) {
+ ret = -EEXIST;
+ goto restore;
+ }
+
+ hlist_del(&e->hentry);
+ hash_add(db->entries_hash, &e->hentry, hash);
+ }
+
+restore:
+ /* if we failed, flush all entries we added so far */
+ if (ret < 0)
+ __kdbus_policy_remove_owner(db, owner);
+
+ /* if we failed, restore entries, otherwise release them */
+ hlist_for_each_entry_safe(e, tmp, &restore, hentry) {
+ hlist_del(&e->hentry);
+ if (ret < 0) {
+ hash = kdbus_strhash(e->name);
+ hash_add(db->entries_hash, &e->hentry, hash);
+ } else {
+ kdbus_policy_entry_free(e);
+ }
+ }
+
+ up_write(&db->entries_rwlock);
+
+exit:
+ hlist_for_each_entry_safe(e, tmp, &entries, hentry) {
+ hlist_del(&e->hentry);
+ kdbus_policy_entry_free(e);
+ }
+
+ return ret;
+}
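
A hedged sketch of the expected call shape; @items must be a sequence of KDBUS_ITEM_NAME items, each followed by its KDBUS_ITEM_POLICY_ACCESS items, and db, items, items_size and owner are placeholders for the caller's context:

	int ret;

	/* 0 = no policy limit, true = wildcard names allowed */
	ret = kdbus_policy_set(db, items, items_size, 0, true, owner);
	if (ret < 0)
		return ret;	/* e.g. -EEXIST on duplicate names */
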
diff --git a/ipc/kdbus/policy.h b/ipc/kdbus/policy.h
new file mode 100644
index 000000000..15dd7bc12
--- /dev/null
+++ b/ipc/kdbus/policy.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_POLICY_H
+#define __KDBUS_POLICY_H
+
+#include <linux/hashtable.h>
+#include <linux/rwsem.h>
+
+struct kdbus_conn;
+struct kdbus_item;
+
+/**
+ * struct kdbus_policy_db - policy database
+ * @entries_hash: Hashtable of entries
+ * @entries_rwlock: Mutex to protect the database's access entries
+ */
+struct kdbus_policy_db {
+ DECLARE_HASHTABLE(entries_hash, 6);
+ struct rw_semaphore entries_rwlock;
+};
+
+void kdbus_policy_db_init(struct kdbus_policy_db *db);
+void kdbus_policy_db_clear(struct kdbus_policy_db *db);
+
+int kdbus_policy_query_unlocked(struct kdbus_policy_db *db,
+ const struct cred *cred, const char *name,
+ unsigned int hash);
+int kdbus_policy_query(struct kdbus_policy_db *db, const struct cred *cred,
+ const char *name, unsigned int hash);
+
+void kdbus_policy_remove_owner(struct kdbus_policy_db *db,
+ const void *owner);
+int kdbus_policy_set(struct kdbus_policy_db *db,
+ const struct kdbus_item *items,
+ size_t items_size,
+ size_t max_policies,
+ bool allow_wildcards,
+ const void *owner);
+
+#endif
diff --git a/ipc/kdbus/pool.c b/ipc/kdbus/pool.c
new file mode 100644
index 000000000..63ccd5571
--- /dev/null
+++ b/ipc/kdbus/pool.c
@@ -0,0 +1,728 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/aio.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/shmem_fs.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+
+#include "pool.h"
+#include "util.h"
+
+/**
+ * struct kdbus_pool - the receiver's buffer
+ * @f: The backing shmem file
+ * @size: The size of the file
+ * @accounted_size: Currently accounted memory in bytes
+ * @lock: Pool data lock
+ * @slices: All slices sorted by address
+ * @slices_busy: Tree of allocated slices
+ * @slices_free: Tree of free slices
+ *
+ * The receiver's buffer, managed as a pool of allocated and free
+ * slices containing the queued messages.
+ *
+ * Messages sent with KDBUS_CMD_SEND are copied directly by the
+ * sending process into the receiver's pool.
+ *
+ * Messages received with KDBUS_CMD_RECV just return the offset
+ * to the data placed in the pool.
+ *
+ * The internally allocated memory needs to be returned by the receiver
+ * with KDBUS_CMD_FREE.
+ */
+struct kdbus_pool {
+ struct file *f;
+ size_t size;
+ size_t accounted_size;
+ struct mutex lock;
+
+ struct list_head slices;
+ struct rb_root slices_busy;
+ struct rb_root slices_free;
+};
+
+/**
+ * struct kdbus_pool_slice - allocated element in kdbus_pool
+ * @pool: Pool this slice belongs to
+ * @off: Offset of slice in the shmem file
+ * @size: Size of slice
+ * @entry: Entry in "all slices" list
+ * @rb_node: Entry in free or busy list
+ * @free: Unused slice
+ * @accounted: Accounted as queue slice
+ * @ref_kernel: Kernel holds a reference
+ * @ref_user: Userspace holds a reference
+ *
+ * The pool has one or more slices, always spanning the entire size of the
+ * pool.
+ *
+ * Every slice is an element in a list sorted by the buffer address, to
+ * provide access to the next neighbor slice.
+ *
+ * Every slice is member in either the busy or the free tree. The free
+ * tree is organized by slice size, the busy tree organized by buffer
+ * offset.
+ */
+struct kdbus_pool_slice {
+ struct kdbus_pool *pool;
+ size_t off;
+ size_t size;
+
+ struct list_head entry;
+ struct rb_node rb_node;
+
+ bool free:1;
+ bool accounted:1;
+ bool ref_kernel:1;
+ bool ref_user:1;
+};
+
+static struct kdbus_pool_slice *kdbus_pool_slice_new(struct kdbus_pool *pool,
+ size_t off, size_t size)
+{
+ struct kdbus_pool_slice *slice;
+
+ slice = kzalloc(sizeof(*slice), GFP_KERNEL);
+ if (!slice)
+ return NULL;
+
+ slice->pool = pool;
+ slice->off = off;
+ slice->size = size;
+ slice->free = true;
+ return slice;
+}
+
+/* insert a slice into the free tree */
+static void kdbus_pool_add_free_slice(struct kdbus_pool *pool,
+ struct kdbus_pool_slice *slice)
+{
+ struct rb_node **n;
+ struct rb_node *pn = NULL;
+
+ n = &pool->slices_free.rb_node;
+ while (*n) {
+ struct kdbus_pool_slice *pslice;
+
+ pn = *n;
+ pslice = rb_entry(pn, struct kdbus_pool_slice, rb_node);
+ if (slice->size < pslice->size)
+ n = &pn->rb_left;
+ else
+ n = &pn->rb_right;
+ }
+
+ rb_link_node(&slice->rb_node, pn, n);
+ rb_insert_color(&slice->rb_node, &pool->slices_free);
+}
+
+/* insert a slice into the busy tree */
+static void kdbus_pool_add_busy_slice(struct kdbus_pool *pool,
+ struct kdbus_pool_slice *slice)
+{
+ struct rb_node **n;
+ struct rb_node *pn = NULL;
+
+ n = &pool->slices_busy.rb_node;
+ while (*n) {
+ struct kdbus_pool_slice *pslice;
+
+ pn = *n;
+ pslice = rb_entry(pn, struct kdbus_pool_slice, rb_node);
+ if (slice->off < pslice->off)
+ n = &pn->rb_left;
+ else if (slice->off > pslice->off)
+ n = &pn->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&slice->rb_node, pn, n);
+ rb_insert_color(&slice->rb_node, &pool->slices_busy);
+}
+
+static struct kdbus_pool_slice *kdbus_pool_find_slice(struct kdbus_pool *pool,
+ size_t off)
+{
+ struct rb_node *n;
+
+ n = pool->slices_busy.rb_node;
+ while (n) {
+ struct kdbus_pool_slice *s;
+
+ s = rb_entry(n, struct kdbus_pool_slice, rb_node);
+ if (off < s->off)
+ n = n->rb_left;
+ else if (off > s->off)
+ n = n->rb_right;
+ else
+ return s;
+ }
+
+ return NULL;
+}
+
+/**
+ * kdbus_pool_slice_alloc() - allocate memory from a pool
+ * @pool: The receiver's pool
+ * @size: The number of bytes to allocate
+ * @accounted: Whether this slice should be accounted for
+ *
+ * Allocate @size bytes from @pool. The returned slice must be passed to
+ * kdbus_pool_slice_release() to free the allocated memory once it is no
+ * longer needed.
+ *
+ * Return: the allocated slice on success, ERR_PTR on failure.
+ */
+struct kdbus_pool_slice *kdbus_pool_slice_alloc(struct kdbus_pool *pool,
+ size_t size, bool accounted)
+{
+ size_t slice_size = KDBUS_ALIGN8(size);
+ struct rb_node *n, *found = NULL;
+ struct kdbus_pool_slice *s;
+ int ret = 0;
+
+ if (WARN_ON(!size))
+ return ERR_PTR(-EINVAL);
+
+ /* search a free slice with the closest matching size */
+ mutex_lock(&pool->lock);
+ n = pool->slices_free.rb_node;
+ while (n) {
+ s = rb_entry(n, struct kdbus_pool_slice, rb_node);
+ if (slice_size < s->size) {
+ found = n;
+ n = n->rb_left;
+ } else if (slice_size > s->size) {
+ n = n->rb_right;
+ } else {
+ found = n;
+ break;
+ }
+ }
+
+ /* no slice with the minimum size found in the pool */
+ if (!found) {
+ ret = -EXFULL;
+ goto exit_unlock;
+ }
+
+ /* no exact match, use the closest one */
+ if (!n) {
+ struct kdbus_pool_slice *s_new;
+
+ s = rb_entry(found, struct kdbus_pool_slice, rb_node);
+
+ /* split-off the remainder of the size to its own slice */
+ s_new = kdbus_pool_slice_new(pool, s->off + slice_size,
+ s->size - slice_size);
+ if (!s_new) {
+ ret = -ENOMEM;
+ goto exit_unlock;
+ }
+
+ list_add(&s_new->entry, &s->entry);
+ kdbus_pool_add_free_slice(pool, s_new);
+
+ /* adjust our size now that we split-off another slice */
+ s->size = slice_size;
+ }
+
+ /* move slice from free to the busy tree */
+ rb_erase(found, &pool->slices_free);
+ kdbus_pool_add_busy_slice(pool, s);
+
+ WARN_ON(s->ref_kernel || s->ref_user);
+
+ s->ref_kernel = true;
+ s->free = false;
+ s->accounted = accounted;
+ if (accounted)
+ pool->accounted_size += s->size;
+ mutex_unlock(&pool->lock);
+
+ return s;
+
+exit_unlock:
+ mutex_unlock(&pool->lock);
+ return ERR_PTR(ret);
+}
+
+static void __kdbus_pool_slice_release(struct kdbus_pool_slice *slice)
+{
+ struct kdbus_pool *pool = slice->pool;
+
+ /* don't free the slice if kernel or user-space still holds a reference */
+ if (slice->ref_kernel || slice->ref_user)
+ return;
+
+ if (WARN_ON(slice->free))
+ return;
+
+ rb_erase(&slice->rb_node, &pool->slices_busy);
+
+ /* merge with the next free slice */
+ if (!list_is_last(&slice->entry, &pool->slices)) {
+ struct kdbus_pool_slice *s;
+
+ s = list_entry(slice->entry.next,
+ struct kdbus_pool_slice, entry);
+ if (s->free) {
+ rb_erase(&s->rb_node, &pool->slices_free);
+ list_del(&s->entry);
+ slice->size += s->size;
+ kfree(s);
+ }
+ }
+
+ /* merge with previous free slice */
+ if (pool->slices.next != &slice->entry) {
+ struct kdbus_pool_slice *s;
+
+ s = list_entry(slice->entry.prev,
+ struct kdbus_pool_slice, entry);
+ if (s->free) {
+ rb_erase(&s->rb_node, &pool->slices_free);
+ list_del(&slice->entry);
+ s->size += slice->size;
+ kfree(slice);
+ slice = s;
+ }
+ }
+
+ slice->free = true;
+ kdbus_pool_add_free_slice(pool, slice);
+}
+
+/**
+ * kdbus_pool_slice_release() - drop kernel-reference on allocated slice
+ * @slice: Slice allocated from the pool
+ *
+ * This releases the kernel-reference on the given slice. If the
+ * kernel-reference and the user-reference on a slice are dropped, the slice is
+ * returned to the pool.
+ *
+ * So far, we do not implement full ref-counting on slices. The kernel and
+ * user-space can each hold exactly one reference to a slice. Once both are
+ * dropped, the slice is released.
+ */
+void kdbus_pool_slice_release(struct kdbus_pool_slice *slice)
+{
+ struct kdbus_pool *pool;
+
+ if (!slice)
+ return;
+
+ /* @slice may be freed, so keep local ptr to @pool */
+ pool = slice->pool;
+
+ mutex_lock(&pool->lock);
+ /* kernel must own a ref to @slice to drop it */
+ WARN_ON(!slice->ref_kernel);
+ slice->ref_kernel = false;
+ /* no longer kernel-owned, de-account slice */
+ if (slice->accounted && !WARN_ON(pool->accounted_size < slice->size))
+ pool->accounted_size -= slice->size;
+ __kdbus_pool_slice_release(slice);
+ mutex_unlock(&pool->lock);
+}
+
+/**
+ * kdbus_pool_release_offset() - release a public offset
+ * @pool: pool to operate on
+ * @off: offset to release
+ *
+ * This should be called whenever user-space frees a slice handed to it. It
+ * verifies the slice is available and public, and then drops it. It ensures
+ * correct locking and barriers against queues.
+ *
+ * Return: 0 on success, -ENXIO if the offset is invalid or not public.
+ */
+int kdbus_pool_release_offset(struct kdbus_pool *pool, size_t off)
+{
+ struct kdbus_pool_slice *slice;
+ int ret = 0;
+
+ /* 'pool->size' is used as dummy offset for empty slices */
+ if (off == pool->size)
+ return 0;
+
+ mutex_lock(&pool->lock);
+ slice = kdbus_pool_find_slice(pool, off);
+ if (slice && slice->ref_user) {
+ slice->ref_user = false;
+ __kdbus_pool_slice_release(slice);
+ } else {
+ ret = -ENXIO;
+ }
+ mutex_unlock(&pool->lock);
+
+ return ret;
+}
+
+/**
+ * kdbus_pool_publish_empty() - publish empty slice to user-space
+ * @pool: pool to operate on
+ * @off: output storage for offset, or NULL
+ * @size: output storage for size, or NULL
+ *
+ * This is the same as kdbus_pool_slice_publish(), but uses a dummy slice with
+ * size 0. The returned offset points to the end of the pool and is never
+ * returned on real slices.
+ */
+void kdbus_pool_publish_empty(struct kdbus_pool *pool, u64 *off, u64 *size)
+{
+ if (off)
+ *off = pool->size;
+ if (size)
+ *size = 0;
+}
+
+/**
+ * kdbus_pool_slice_publish() - publish slice to user-space
+ * @slice: The slice
+ * @out_offset: Output storage for offset, or NULL
+ * @out_size: Output storage for size, or NULL
+ *
+ * This prepares a slice to be published to user-space.
+ *
+ * This call combines the following operations:
+ * * the memory region is flushed so the user's memory view is consistent
+ * * the slice is marked as referenced by user-space, so user-space has to
+ * call KDBUS_CMD_FREE to release it
+ * * the offset and size of the slice are written to the given output
+ * arguments, if non-NULL
+ */
+void kdbus_pool_slice_publish(struct kdbus_pool_slice *slice,
+ u64 *out_offset, u64 *out_size)
+{
+ mutex_lock(&slice->pool->lock);
+ /* kernel must own a ref to @slice to gain a user-space ref */
+ WARN_ON(!slice->ref_kernel);
+ slice->ref_user = true;
+ mutex_unlock(&slice->pool->lock);
+
+ if (out_offset)
+ *out_offset = slice->off;
+ if (out_size)
+ *out_size = slice->size;
+}
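
A sketch of the full slice flow for handing kernel data to a receiver, assuming data/len come from the caller; kdbus_pool_slice_copy_kvec() is defined further down in this file:

	struct kvec kvec = { .iov_base = data, .iov_len = len };
	struct kdbus_pool_slice *slice;
	u64 off, size;
	ssize_t n;

	slice = kdbus_pool_slice_alloc(pool, len, true);
	if (IS_ERR(slice))
		return PTR_ERR(slice);

	n = kdbus_pool_slice_copy_kvec(slice, 0, &kvec, 1, len);
	if (n < 0) {
		kdbus_pool_slice_release(slice);
		return n;
	}

	/* user-space now owns a reference and must call KDBUS_CMD_FREE */
	kdbus_pool_slice_publish(slice, &off, &size);
	kdbus_pool_slice_release(slice);
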
+
+/**
+ * kdbus_pool_slice_offset() - Get a slice's offset inside the pool
+ * @slice: Slice to return the offset of
+ *
+ * Return: The internal offset @slice inside the pool.
+ */
+off_t kdbus_pool_slice_offset(const struct kdbus_pool_slice *slice)
+{
+ return slice->off;
+}
+
+/**
+ * kdbus_pool_slice_size() - get size of a pool slice
+ * @slice: slice to query
+ *
+ * Return: size of the given slice
+ */
+size_t kdbus_pool_slice_size(const struct kdbus_pool_slice *slice)
+{
+ return slice->size;
+}
+
+/**
+ * kdbus_pool_new() - create a new pool
+ * @name: Name of the (deleted) file which shows up in
+ * /proc, used for debugging
+ * @size: Maximum size of the pool
+ *
+ * Return: a new kdbus_pool on success, ERR_PTR on failure.
+ */
+struct kdbus_pool *kdbus_pool_new(const char *name, size_t size)
+{
+ struct kdbus_pool_slice *s;
+ struct kdbus_pool *p;
+ struct file *f;
+ char *n = NULL;
+ int ret;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ if (name) {
+ n = kasprintf(GFP_KERNEL, KBUILD_MODNAME "-conn:%s", name);
+ if (!n) {
+ ret = -ENOMEM;
+ goto exit_free;
+ }
+ }
+
+ f = shmem_file_setup(n ?: KBUILD_MODNAME "-conn", size, 0);
+ kfree(n);
+
+ if (IS_ERR(f)) {
+ ret = PTR_ERR(f);
+ goto exit_free;
+ }
+
+ ret = get_write_access(file_inode(f));
+ if (ret < 0)
+ goto exit_put_shmem;
+
+ /* allocate first slice spanning the entire pool */
+ s = kdbus_pool_slice_new(p, 0, size);
+ if (!s) {
+ ret = -ENOMEM;
+ goto exit_put_write;
+ }
+
+ p->f = f;
+ p->size = size;
+ p->slices_free = RB_ROOT;
+ p->slices_busy = RB_ROOT;
+ mutex_init(&p->lock);
+
+ INIT_LIST_HEAD(&p->slices);
+ list_add(&s->entry, &p->slices);
+
+ kdbus_pool_add_free_slice(p, s);
+ return p;
+
+exit_put_write:
+ put_write_access(file_inode(f));
+exit_put_shmem:
+ fput(f);
+exit_free:
+ kfree(p);
+ return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_pool_free() - destroy pool
+ * @pool: The receiver's pool
+ */
+void kdbus_pool_free(struct kdbus_pool *pool)
+{
+ struct kdbus_pool_slice *s, *tmp;
+
+ if (!pool)
+ return;
+
+ list_for_each_entry_safe(s, tmp, &pool->slices, entry) {
+ list_del(&s->entry);
+ kfree(s);
+ }
+
+ put_write_access(file_inode(pool->f));
+ fput(pool->f);
+ kfree(pool);
+}
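
A minimal pool lifecycle sketch; the connection name and the 16 MiB size are illustrative only:

	struct kdbus_pool *pool;

	pool = kdbus_pool_new("peer-0", SZ_16M);
	if (IS_ERR(pool))
		return PTR_ERR(pool);

	/* ... allocate, publish and release slices ... */

	kdbus_pool_free(pool);
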
+
+/**
+ * kdbus_pool_accounted() - retrieve accounting information
+ * @pool: pool to query
+ * @size: output for overall pool size
+ * @acc: output for currently accounted size
+ *
+ * This returns accounting information of the pool. Note that the data might
+ * change after the function returns, as the pool lock is dropped. You need to
+ * protect the data via other means, if you need reliable accounting.
+ */
+void kdbus_pool_accounted(struct kdbus_pool *pool, size_t *size, size_t *acc)
+{
+ mutex_lock(&pool->lock);
+ if (size)
+ *size = pool->size;
+ if (acc)
+ *acc = pool->accounted_size;
+ mutex_unlock(&pool->lock);
+}
+
+/**
+ * kdbus_pool_slice_copy_iovec() - copy user memory to a slice
+ * @slice: The slice to write to
+ * @off: Offset in the slice to write to
+ * @iov: iovec array, pointing to data to copy
+ * @iov_len: Number of elements in @iov
+ * @total_len: Total number of bytes described in members of @iov
+ *
+ * User memory referenced by @iov will be copied into @slice at offset @off.
+ *
+ * Return: the number of bytes copied, negative errno on failure.
+ */
+ssize_t
+kdbus_pool_slice_copy_iovec(const struct kdbus_pool_slice *slice, loff_t off,
+ struct iovec *iov, size_t iov_len, size_t total_len)
+{
+ struct iov_iter iter;
+ ssize_t len;
+
+ if (WARN_ON(off + total_len > slice->size))
+ return -EFAULT;
+
+ off += slice->off;
+ iov_iter_init(&iter, WRITE, iov, iov_len, total_len);
+ len = vfs_iter_write(slice->pool->f, &iter, &off);
+
+ return (len >= 0 && len != total_len) ? -EFAULT : len;
+}
+
+/**
+ * kdbus_pool_slice_copy_kvec() - copy kernel memory to a slice
+ * @slice: The slice to write to
+ * @off: Offset in the slice to write to
+ * @kvec: kvec array, pointing to data to copy
+ * @kvec_len: Number of elements in @kvec
+ * @total_len: Total number of bytes described in members of @kvec
+ *
+ * Kernel memory referenced by @kvec will be copied into @slice at offset @off.
+ *
+ * Return: the numbers of bytes copied, negative errno on failure.
+ */
+ssize_t kdbus_pool_slice_copy_kvec(const struct kdbus_pool_slice *slice,
+ loff_t off, struct kvec *kvec,
+ size_t kvec_len, size_t total_len)
+{
+ struct iov_iter iter;
+ mm_segment_t old_fs;
+ ssize_t len;
+
+ if (WARN_ON(off + total_len > slice->size))
+ return -EFAULT;
+
+ off += slice->off;
+ iov_iter_kvec(&iter, WRITE | ITER_KVEC, kvec, kvec_len, total_len);
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ len = vfs_iter_write(slice->pool->f, &iter, &off);
+ set_fs(old_fs);
+
+ return (len >= 0 && len != total_len) ? -EFAULT : len;
+}
+
+/**
+ * kdbus_pool_slice_copy() - copy data from one slice into another
+ * @slice_dst: destination slice
+ * @slice_src: source slice
+ *
+ * Return: 0 on success, negative error number on failure.
+ */
+int kdbus_pool_slice_copy(const struct kdbus_pool_slice *slice_dst,
+ const struct kdbus_pool_slice *slice_src)
+{
+ struct file *f_src = slice_src->pool->f;
+ struct file *f_dst = slice_dst->pool->f;
+ struct inode *i_dst = file_inode(f_dst);
+ struct address_space *mapping_dst = f_dst->f_mapping;
+ const struct address_space_operations *aops = mapping_dst->a_ops;
+ unsigned long len = slice_src->size;
+ loff_t off_src = slice_src->off;
+ loff_t off_dst = slice_dst->off;
+ mm_segment_t old_fs;
+ int ret = 0;
+
+ if (WARN_ON(slice_src->size != slice_dst->size) ||
+ WARN_ON(slice_src->free || slice_dst->free))
+ return -EINVAL;
+
+ mutex_lock(&i_dst->i_mutex);
+ old_fs = get_fs();
+ set_fs(get_ds());
+ while (len > 0) {
+ unsigned long page_off;
+ unsigned long copy_len;
+ char __user *kaddr;
+ struct page *page;
+ ssize_t n_read;
+ void *fsdata;
+ long status;
+
+ page_off = off_dst & (PAGE_CACHE_SIZE - 1);
+ copy_len = min_t(unsigned long,
+ PAGE_CACHE_SIZE - page_off, len);
+
+ status = aops->write_begin(f_dst, mapping_dst, off_dst,
+ copy_len, 0, &page, &fsdata);
+ if (unlikely(status < 0)) {
+ ret = status;
+ break;
+ }
+
+ kaddr = (char __force __user *)kmap(page) + page_off;
+ n_read = __vfs_read(f_src, kaddr, copy_len, &off_src);
+ kunmap(page);
+ mark_page_accessed(page);
+ flush_dcache_page(page);
+
+ if (unlikely(n_read != copy_len)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ status = aops->write_end(f_dst, mapping_dst, off_dst,
+ copy_len, copy_len, page, fsdata);
+ if (unlikely(status != copy_len)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ off_dst += copy_len;
+ len -= copy_len;
+ }
+ set_fs(old_fs);
+ mutex_unlock(&i_dst->i_mutex);
+
+ return ret;
+}
+
+/**
+ * kdbus_pool_mmap() - map the pool into the process
+ * @pool: The receiver's pool
+ * @vma: passed by mmap() syscall
+ *
+ * Return: the result of the mmap() call, negative errno on failure.
+ */
+int kdbus_pool_mmap(const struct kdbus_pool *pool, struct vm_area_struct *vma)
+{
+ /* deny write access to the pool */
+ if (vma->vm_flags & VM_WRITE)
+ return -EPERM;
+ vma->vm_flags &= ~VM_MAYWRITE;
+
+ /* do not allow to map more than the size of the file */
+ if ((vma->vm_end - vma->vm_start) > pool->size)
+ return -EFAULT;
+
+ /* replace the connection file with our shmem file */
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ vma->vm_file = get_file(pool->f);
+
+ return pool->f->f_op->mmap(pool->f, vma);
+}
diff --git a/ipc/kdbus/pool.h b/ipc/kdbus/pool.h
new file mode 100644
index 000000000..a9038213a
--- /dev/null
+++ b/ipc/kdbus/pool.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_POOL_H
+#define __KDBUS_POOL_H
+
+#include <linux/uio.h>
+
+struct kdbus_pool;
+struct kdbus_pool_slice;
+
+struct kdbus_pool *kdbus_pool_new(const char *name, size_t size);
+void kdbus_pool_free(struct kdbus_pool *pool);
+void kdbus_pool_accounted(struct kdbus_pool *pool, size_t *size, size_t *acc);
+int kdbus_pool_mmap(const struct kdbus_pool *pool, struct vm_area_struct *vma);
+int kdbus_pool_release_offset(struct kdbus_pool *pool, size_t off);
+void kdbus_pool_publish_empty(struct kdbus_pool *pool, u64 *off, u64 *size);
+
+struct kdbus_pool_slice *kdbus_pool_slice_alloc(struct kdbus_pool *pool,
+ size_t size, bool accounted);
+void kdbus_pool_slice_release(struct kdbus_pool_slice *slice);
+void kdbus_pool_slice_publish(struct kdbus_pool_slice *slice,
+ u64 *out_offset, u64 *out_size);
+off_t kdbus_pool_slice_offset(const struct kdbus_pool_slice *slice);
+size_t kdbus_pool_slice_size(const struct kdbus_pool_slice *slice);
+int kdbus_pool_slice_copy(const struct kdbus_pool_slice *slice_dst,
+ const struct kdbus_pool_slice *slice_src);
+ssize_t kdbus_pool_slice_copy_kvec(const struct kdbus_pool_slice *slice,
+ loff_t off, struct kvec *kvec,
+ size_t kvec_count, size_t total_len);
+ssize_t kdbus_pool_slice_copy_iovec(const struct kdbus_pool_slice *slice,
+ loff_t off, struct iovec *iov,
+ size_t iov_count, size_t total_len);
+
+#endif
diff --git a/ipc/kdbus/queue.c b/ipc/kdbus/queue.c
new file mode 100644
index 000000000..f9c44d7ba
--- /dev/null
+++ b/ipc/kdbus/queue.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/audit.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/hashtable.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/math64.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uio.h>
+
+#include "util.h"
+#include "domain.h"
+#include "connection.h"
+#include "item.h"
+#include "message.h"
+#include "metadata.h"
+#include "queue.h"
+#include "reply.h"
+
+/**
+ * kdbus_queue_init() - initialize data structure related to a queue
+ * @queue: The queue to initialize
+ */
+void kdbus_queue_init(struct kdbus_queue *queue)
+{
+ INIT_LIST_HEAD(&queue->msg_list);
+ queue->msg_prio_queue = RB_ROOT;
+}
+
+/**
+ * kdbus_queue_peek() - Retrieves an entry from a queue
+ * @queue: The queue
+ * @priority: The minimum priority of the entry to peek
+ * @use_priority: Boolean flag whether or not to peek by priority
+ *
+ * Look for an entry in a queue, either by priority or the oldest one (FIFO).
+ * The entry is neither freed nor removed from the queue's lists.
+ *
+ * Return: the peeked queue entry on success, NULL if no suitable msg is found
+ */
+struct kdbus_queue_entry *kdbus_queue_peek(struct kdbus_queue *queue,
+ s64 priority, bool use_priority)
+{
+ struct kdbus_queue_entry *e;
+
+ if (list_empty(&queue->msg_list))
+ return NULL;
+
+ if (use_priority) {
+ /* get next entry with highest priority */
+ e = rb_entry(queue->msg_prio_highest,
+ struct kdbus_queue_entry, prio_node);
+
+ /* no entry with the requested priority */
+ if (e->priority > priority)
+ return NULL;
+ } else {
+ /* ignore the priority, return the next entry in the list */
+ e = list_first_entry(&queue->msg_list,
+ struct kdbus_queue_entry, entry);
+ }
+
+ return e;
+}
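
A sketch of a simplified RECV-style path built on the peek helper, assuming the caller holds conn->lock as the lockdep assertions in this file require; off and size would be copied back to user-space by the ioctl handler, and error handling is reduced to the bare minimum:

	struct kdbus_queue_entry *e;
	u64 return_flags = 0, off, size;
	int ret = 0;

	mutex_lock(&conn->lock);
	e = kdbus_queue_peek(&conn->queue, 0, false);	/* oldest entry, FIFO */
	if (e) {
		ret = kdbus_queue_entry_install(e, &return_flags, true);
		if (ret == 0)
			kdbus_pool_slice_publish(e->slice, &off, &size);
		kdbus_queue_entry_free(e);
	}
	mutex_unlock(&conn->lock);
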
+
+static void kdbus_queue_entry_link(struct kdbus_queue_entry *entry)
+{
+ struct kdbus_queue *queue = &entry->conn->queue;
+ struct rb_node **n, *pn = NULL;
+ bool highest = true;
+
+ lockdep_assert_held(&entry->conn->lock);
+ if (WARN_ON(!list_empty(&entry->entry)))
+ return;
+
+ /* sort into priority entry tree */
+ n = &queue->msg_prio_queue.rb_node;
+ while (*n) {
+ struct kdbus_queue_entry *e;
+
+ pn = *n;
+ e = rb_entry(pn, struct kdbus_queue_entry, prio_node);
+
+ /* existing node for this priority, add to its list */
+ if (likely(entry->priority == e->priority)) {
+ list_add_tail(&entry->prio_entry, &e->prio_entry);
+ goto prio_done;
+ }
+
+ if (entry->priority < e->priority) {
+ n = &pn->rb_left;
+ } else {
+ n = &pn->rb_right;
+ highest = false;
+ }
+ }
+
+ /* cache highest-priority entry */
+ if (highest)
+ queue->msg_prio_highest = &entry->prio_node;
+
+ /* new node for this priority */
+ rb_link_node(&entry->prio_node, pn, n);
+ rb_insert_color(&entry->prio_node, &queue->msg_prio_queue);
+ INIT_LIST_HEAD(&entry->prio_entry);
+
+prio_done:
+ /* add to unsorted fifo list */
+ list_add_tail(&entry->entry, &queue->msg_list);
+}
+
+static void kdbus_queue_entry_unlink(struct kdbus_queue_entry *entry)
+{
+ struct kdbus_queue *queue = &entry->conn->queue;
+
+ lockdep_assert_held(&entry->conn->lock);
+ if (list_empty(&entry->entry))
+ return;
+
+ list_del_init(&entry->entry);
+
+ if (list_empty(&entry->prio_entry)) {
+ /*
+ * Single entry for this priority, update cached
+ * highest-priority entry, remove the tree node.
+ */
+ if (queue->msg_prio_highest == &entry->prio_node)
+ queue->msg_prio_highest = rb_next(&entry->prio_node);
+
+ rb_erase(&entry->prio_node, &queue->msg_prio_queue);
+ } else {
+ struct kdbus_queue_entry *q;
+
+ /*
+ * Multiple entries for this priority entry, get next one in
+ * the list. Update cached highest-priority entry, store the
+ * new one as the tree node.
+ */
+ q = list_first_entry(&entry->prio_entry,
+ struct kdbus_queue_entry, prio_entry);
+ list_del(&entry->prio_entry);
+
+ if (queue->msg_prio_highest == &entry->prio_node)
+ queue->msg_prio_highest = &q->prio_node;
+
+ rb_replace_node(&entry->prio_node, &q->prio_node,
+ &queue->msg_prio_queue);
+ }
+}
+
+/**
+ * kdbus_queue_entry_new() - allocate a queue entry
+ * @src: source connection, or NULL
+ * @dst: destination connection
+ * @s: staging object carrying the message
+ *
+ * Allocates a queue entry based on a given message and allocates space for
+ * the message payload and the requested metadata in the connection's pool.
+ * The entry is not actually added to the queue's lists at this point.
+ *
+ * Return: the allocated entry on success, or an ERR_PTR on failures.
+ */
+struct kdbus_queue_entry *kdbus_queue_entry_new(struct kdbus_conn *src,
+ struct kdbus_conn *dst,
+ struct kdbus_staging *s)
+{
+ struct kdbus_queue_entry *entry;
+ int ret;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&entry->entry);
+ entry->priority = s->msg->priority;
+ entry->conn = kdbus_conn_ref(dst);
+ entry->gaps = kdbus_gaps_ref(s->gaps);
+
+ entry->slice = kdbus_staging_emit(s, src, dst);
+ if (IS_ERR(entry->slice)) {
+ ret = PTR_ERR(entry->slice);
+ entry->slice = NULL;
+ goto error;
+ }
+
+ entry->user = src ? kdbus_user_ref(src->user) : NULL;
+ return entry;
+
+error:
+ kdbus_queue_entry_free(entry);
+ return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_queue_entry_free() - free resources of an entry
+ * @entry: The entry to free
+ *
+ * Removes resources allocated by a queue entry, along with the entry itself.
+ * Note that the entry's slice is not freed at this point.
+ */
+void kdbus_queue_entry_free(struct kdbus_queue_entry *entry)
+{
+ if (!entry)
+ return;
+
+ lockdep_assert_held(&entry->conn->lock);
+
+ kdbus_queue_entry_unlink(entry);
+ kdbus_reply_unref(entry->reply);
+
+ if (entry->slice) {
+ kdbus_conn_quota_dec(entry->conn, entry->user,
+ kdbus_pool_slice_size(entry->slice),
+ entry->gaps ? entry->gaps->n_fds : 0);
+ kdbus_pool_slice_release(entry->slice);
+ }
+
+ kdbus_user_unref(entry->user);
+ kdbus_gaps_unref(entry->gaps);
+ kdbus_conn_unref(entry->conn);
+ kfree(entry);
+}
+
+/**
+ * kdbus_queue_entry_install() - install message components into the
+ * receiver's process
+ * @entry: The queue entry to install
+ * @return_flags: Pointer to store the return flags for userspace
+ * @install_fds: Whether or not to install associated file descriptors
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_queue_entry_install(struct kdbus_queue_entry *entry,
+ u64 *return_flags, bool install_fds)
+{
+ bool incomplete_fds = false;
+ int ret;
+
+ lockdep_assert_held(&entry->conn->lock);
+
+ ret = kdbus_gaps_install(entry->gaps, entry->slice, &incomplete_fds);
+ if (ret < 0)
+ return ret;
+
+ if (incomplete_fds)
+ *return_flags |= KDBUS_RECV_RETURN_INCOMPLETE_FDS;
+ return 0;
+}
+
+/**
+ * kdbus_queue_entry_enqueue() - enqueue an entry
+ * @entry: entry to enqueue
+ * @reply: reply to link to this entry (or NULL if none)
+ *
+ * This enqueues an unqueued entry into the message queue of the linked
+ * connection. It also binds a reply object to the entry so we can remember it
+ * when the message is moved.
+ *
+ * Once this call returns (and the connection lock is released), this entry can
+ * be dequeued by the target connection. Note that the entry will not be removed
+ * from the queue until it is destroyed.
+ */
+void kdbus_queue_entry_enqueue(struct kdbus_queue_entry *entry,
+ struct kdbus_reply *reply)
+{
+ lockdep_assert_held(&entry->conn->lock);
+
+ if (WARN_ON(entry->reply) || WARN_ON(!list_empty(&entry->entry)))
+ return;
+
+ entry->reply = kdbus_reply_ref(reply);
+ kdbus_queue_entry_link(entry);
+}
+
+/**
+ * kdbus_queue_entry_move() - move queue entry
+ * @e: queue entry to move
+ * @dst: destination connection to queue the entry on
+ *
+ * This moves a queue entry onto a different connection. It allocates a new
+ * slice on the target connection and copies the message over. If the copy
+ * succeeded, we move the entry from @src to @dst.
+ *
+ * On failure, the entry is left untouched.
+ *
+ * The queue entry must currently be queued; after the call succeeds, it is
+ * queued on the destination, but no longer on the source.
+ *
+ * The caller must hold the connection lock of the source *and* destination.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_queue_entry_move(struct kdbus_queue_entry *e,
+ struct kdbus_conn *dst)
+{
+ struct kdbus_pool_slice *slice = NULL;
+ struct kdbus_conn *src = e->conn;
+ size_t size, fds;
+ int ret;
+
+ lockdep_assert_held(&src->lock);
+ lockdep_assert_held(&dst->lock);
+
+ if (WARN_ON(list_empty(&e->entry)))
+ return -EINVAL;
+ if (src == dst)
+ return 0;
+
+ size = kdbus_pool_slice_size(e->slice);
+ fds = e->gaps ? e->gaps->n_fds : 0;
+
+ ret = kdbus_conn_quota_inc(dst, e->user, size, fds);
+ if (ret < 0)
+ return ret;
+
+ slice = kdbus_pool_slice_alloc(dst->pool, size, true);
+ if (IS_ERR(slice)) {
+ ret = PTR_ERR(slice);
+ slice = NULL;
+ goto error;
+ }
+
+ ret = kdbus_pool_slice_copy(slice, e->slice);
+ if (ret < 0)
+ goto error;
+
+ kdbus_queue_entry_unlink(e);
+ kdbus_conn_quota_dec(src, e->user, size, fds);
+ kdbus_pool_slice_release(e->slice);
+ kdbus_conn_unref(e->conn);
+
+ e->slice = slice;
+ e->conn = kdbus_conn_ref(dst);
+ kdbus_queue_entry_link(e);
+
+ return 0;
+
+error:
+ kdbus_pool_slice_release(slice);
+ kdbus_conn_quota_dec(dst, e->user, size, fds);
+ return ret;
+}
diff --git a/ipc/kdbus/queue.h b/ipc/kdbus/queue.h
new file mode 100644
index 000000000..bf686d182
--- /dev/null
+++ b/ipc/kdbus/queue.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_QUEUE_H
+#define __KDBUS_QUEUE_H
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+struct kdbus_conn;
+struct kdbus_pool_slice;
+struct kdbus_reply;
+struct kdbus_staging;
+struct kdbus_user;
+
+/**
+ * struct kdbus_queue - a connection's message queue
+ * @msg_list: List head for kdbus_queue_entry objects
+ * @msg_prio_queue: RB tree root for messages, sorted by priority
+ * @msg_prio_highest: Link to the RB node referencing the message with the
+ * highest priority in the tree.
+ */
+struct kdbus_queue {
+ struct list_head msg_list;
+ struct rb_root msg_prio_queue;
+ struct rb_node *msg_prio_highest;
+};
+
+/**
+ * struct kdbus_queue_entry - messages waiting to be read
+ * @entry: Entry in the connection's list
+ * @prio_node: Entry in the priority queue tree
+ * @prio_entry: Queue tree node entry in the list of one priority
+ * @priority: Message priority
+ * @dst_name_id: The sequence number of the name this message is
+ * addressed to, 0 for messages sent to an ID
+ * @conn: Connection this entry is queued on
+ * @gaps: Gaps object to fill message gaps at RECV time
+ * @user: User used for accounting
+ * @slice: Slice in the receiver's pool for the message
+ * @reply: The reply block if a reply to this message is expected
+ */
+struct kdbus_queue_entry {
+ struct list_head entry;
+ struct rb_node prio_node;
+ struct list_head prio_entry;
+
+ s64 priority;
+ u64 dst_name_id;
+
+ struct kdbus_conn *conn;
+ struct kdbus_gaps *gaps;
+ struct kdbus_user *user;
+ struct kdbus_pool_slice *slice;
+ struct kdbus_reply *reply;
+};
+
+void kdbus_queue_init(struct kdbus_queue *queue);
+struct kdbus_queue_entry *kdbus_queue_peek(struct kdbus_queue *queue,
+ s64 priority, bool use_priority);
+
+struct kdbus_queue_entry *kdbus_queue_entry_new(struct kdbus_conn *src,
+ struct kdbus_conn *dst,
+ struct kdbus_staging *s);
+void kdbus_queue_entry_free(struct kdbus_queue_entry *entry);
+int kdbus_queue_entry_install(struct kdbus_queue_entry *entry,
+ u64 *return_flags, bool install_fds);
+void kdbus_queue_entry_enqueue(struct kdbus_queue_entry *entry,
+ struct kdbus_reply *reply);
+int kdbus_queue_entry_move(struct kdbus_queue_entry *entry,
+ struct kdbus_conn *dst);
+
+#endif /* __KDBUS_QUEUE_H */
diff --git a/ipc/kdbus/reply.c b/ipc/kdbus/reply.c
new file mode 100644
index 000000000..e6791d86e
--- /dev/null
+++ b/ipc/kdbus/reply.c
@@ -0,0 +1,252 @@
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "message.h"
+#include "metadata.h"
+#include "names.h"
+#include "domain.h"
+#include "item.h"
+#include "notify.h"
+#include "policy.h"
+#include "reply.h"
+#include "util.h"
+
+/**
+ * kdbus_reply_new() - Allocate and set up a new kdbus_reply object
+ * @reply_src: The connection a reply is expected from
+ * @reply_dst: The connection this reply object belongs to
+ * @msg: Message associated with the reply
+ * @name_entry: Name entry used to send the message
+ * @sync: Whether or not to make this reply synchronous
+ *
+ * Allocate and fill a new kdbus_reply object.
+ *
+ * Return: New kdbus_reply object on success, ERR_PTR on error.
+ */
+struct kdbus_reply *kdbus_reply_new(struct kdbus_conn *reply_src,
+ struct kdbus_conn *reply_dst,
+ const struct kdbus_msg *msg,
+ struct kdbus_name_entry *name_entry,
+ bool sync)
+{
+ struct kdbus_reply *r;
+ int ret;
+
+ if (atomic_inc_return(&reply_dst->request_count) >
+ KDBUS_CONN_MAX_REQUESTS_PENDING) {
+ ret = -EMLINK;
+ goto exit_dec_request_count;
+ }
+
+ r = kzalloc(sizeof(*r), GFP_KERNEL);
+ if (!r) {
+ ret = -ENOMEM;
+ goto exit_dec_request_count;
+ }
+
+ kref_init(&r->kref);
+ INIT_LIST_HEAD(&r->entry);
+ r->reply_src = kdbus_conn_ref(reply_src);
+ r->reply_dst = kdbus_conn_ref(reply_dst);
+ r->cookie = msg->cookie;
+ r->name_id = name_entry ? name_entry->name_id : 0;
+ r->deadline_ns = msg->timeout_ns;
+
+ if (sync) {
+ r->sync = true;
+ r->waiting = true;
+ }
+
+ return r;
+
+exit_dec_request_count:
+ atomic_dec(&reply_dst->request_count);
+ return ERR_PTR(ret);
+}
+
+static void __kdbus_reply_free(struct kref *kref)
+{
+ struct kdbus_reply *reply =
+ container_of(kref, struct kdbus_reply, kref);
+
+ atomic_dec(&reply->reply_dst->request_count);
+ kdbus_conn_unref(reply->reply_src);
+ kdbus_conn_unref(reply->reply_dst);
+ kfree(reply);
+}
+
+/**
+ * kdbus_reply_ref() - Increase reference on kdbus_reply
+ * @r: The reply, may be %NULL
+ *
+ * Return: The reply object with an extra reference
+ */
+struct kdbus_reply *kdbus_reply_ref(struct kdbus_reply *r)
+{
+ if (r)
+ kref_get(&r->kref);
+ return r;
+}
+
+/**
+ * kdbus_reply_unref() - Decrease reference on kdbus_reply
+ * @r: The reply, may be %NULL
+ *
+ * Return: NULL
+ */
+struct kdbus_reply *kdbus_reply_unref(struct kdbus_reply *r)
+{
+ if (r)
+ kref_put(&r->kref, __kdbus_reply_free);
+ return NULL;
+}
+
+/**
+ * kdbus_reply_link() - Link reply object into target connection
+ * @r: Reply to link
+ */
+void kdbus_reply_link(struct kdbus_reply *r)
+{
+ if (WARN_ON(!list_empty(&r->entry)))
+ return;
+
+ list_add(&r->entry, &r->reply_dst->reply_list);
+ kdbus_reply_ref(r);
+}
+
+/**
+ * kdbus_reply_unlink() - Unlink reply object from target connection
+ * @r: Reply to unlink
+ */
+void kdbus_reply_unlink(struct kdbus_reply *r)
+{
+ if (!list_empty(&r->entry)) {
+ list_del_init(&r->entry);
+ kdbus_reply_unref(r);
+ }
+}
+
+/**
+ * kdbus_sync_reply_wakeup() - Wake a synchronously blocking reply
+ * @reply: The reply object
+ * @err: Error code to set on the remote side
+ *
+ * Wake up remote peer (method origin) with the appropriate synchronous reply
+ * code.
+ */
+void kdbus_sync_reply_wakeup(struct kdbus_reply *reply, int err)
+{
+ if (WARN_ON(!reply->sync))
+ return;
+
+ reply->waiting = false;
+ reply->err = err;
+ wake_up_interruptible(&reply->reply_dst->wait);
+}
+
+/**
+ * kdbus_reply_find() - Find the corresponding reply object
+ * @replying: The replying connection or NULL
+ * @reply_dst: The connection the reply will be sent to
+ * (method origin)
+ * @cookie: The cookie of the requesting message
+ *
+ * Lookup a reply object that should be sent as a reply by
+ * @replying to @reply_dst with the given cookie.
+ *
+ * Callers must take the @reply_dst lock.
+ *
+ * Return: the corresponding reply object or NULL if not found
+ */
+struct kdbus_reply *kdbus_reply_find(struct kdbus_conn *replying,
+ struct kdbus_conn *reply_dst,
+ u64 cookie)
+{
+ struct kdbus_reply *r;
+
+ list_for_each_entry(r, &reply_dst->reply_list, entry) {
+ if (r->cookie == cookie &&
+ (!replying || r->reply_src == replying))
+ return r;
+ }
+
+ return NULL;
+}
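
A sketch of how the reply tracker is typically resolved when a reply message arrives, assuming the caller holds the lock of the original requester conn and that replying and msg_cookie come from the incoming message:

	struct kdbus_reply *r;

	mutex_lock(&conn->lock);
	r = kdbus_reply_find(replying, conn, msg_cookie);
	if (r) {
		if (r->sync)
			kdbus_sync_reply_wakeup(r, 0);
		kdbus_reply_unlink(r);
	}
	mutex_unlock(&conn->lock);
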
+
+/**
+ * kdbus_reply_list_scan_work() - Worker callback to scan the replies of a
+ * connection for exceeded timeouts
+ * @work: Work struct of the connection to scan
+ *
+ * Walk the list of replies stored with a connection and look for entries
+ * that have exceeded their timeout. If such an entry is found, a timeout
+ * notification is sent to the waiting peer, and the reply is removed from
+ * the list.
+ *
+ * The work is rescheduled to the nearest timeout found during the list
+ * iteration.
+ */
+void kdbus_reply_list_scan_work(struct work_struct *work)
+{
+ struct kdbus_conn *conn =
+ container_of(work, struct kdbus_conn, work.work);
+ struct kdbus_reply *reply, *reply_tmp;
+ u64 deadline = ~0ULL;
+ u64 now;
+
+ now = ktime_get_ns();
+
+ mutex_lock(&conn->lock);
+ if (!kdbus_conn_active(conn)) {
+ mutex_unlock(&conn->lock);
+ return;
+ }
+
+ list_for_each_entry_safe(reply, reply_tmp, &conn->reply_list, entry) {
+ /*
+ * If the reply block is waiting for synchronous I/O,
+ * the timeout is handled by wait_event_*_timeout(),
+ * so we don't have to care for it here.
+ */
+ if (reply->sync && !reply->interrupted)
+ continue;
+
+ WARN_ON(reply->reply_dst != conn);
+
+ if (reply->deadline_ns > now) {
+ /* remember next timeout */
+ if (deadline > reply->deadline_ns)
+ deadline = reply->deadline_ns;
+
+ continue;
+ }
+
+ /*
+ * A zero deadline means the connection died, was
+ * cleaned up already and the notification was sent.
+ * Don't send notifications for reply trackers that were
+ * left in an interrupted syscall state.
+ */
+ if (reply->deadline_ns != 0 && !reply->interrupted)
+ kdbus_notify_reply_timeout(conn->ep->bus, conn->id,
+ reply->cookie);
+
+ kdbus_reply_unlink(reply);
+ }
+
+ /* rearm delayed work with next timeout */
+ if (deadline != ~0ULL)
+ schedule_delayed_work(&conn->work,
+ nsecs_to_jiffies(deadline - now));
+
+ mutex_unlock(&conn->lock);
+
+ kdbus_notify_flush(conn->ep->bus);
+}
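
A sketch of how the connection code is expected to wire up this worker; conn->work is the delayed_work that the container_of() above resolves against:

	INIT_DELAYED_WORK(&conn->work, kdbus_reply_list_scan_work);

	/* force an immediate re-scan, e.g. after a reply deadline changed */
	schedule_delayed_work(&conn->work, 0);
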
diff --git a/ipc/kdbus/reply.h b/ipc/kdbus/reply.h
new file mode 100644
index 000000000..68d52321a
--- /dev/null
+++ b/ipc/kdbus/reply.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_REPLY_H
+#define __KDBUS_REPLY_H
+
+/**
+ * struct kdbus_reply - an entry of kdbus_conn's list of replies
+ * @kref: Ref-count of this object
+ * @entry: The entry of the connection's reply_list
+ * @reply_src: The connection the reply will be sent from
+ * @reply_dst: The connection the reply will be sent to
+ * @queue_entry: The queue entry item that is prepared by the replying
+ * connection
+ * @deadline_ns: The deadline of the reply, in nanoseconds
+ * @cookie: The cookie of the requesting message
+ * @name_id: ID of the well-known name the original msg was sent to
+ * @sync: The reply block is waiting for synchronous I/O
+ * @waiting: The condition to synchronously wait for
+ * @interrupted: The sync reply was left in an interrupted state
+ * @err: The error code for the synchronous reply
+ */
+struct kdbus_reply {
+ struct kref kref;
+ struct list_head entry;
+ struct kdbus_conn *reply_src;
+ struct kdbus_conn *reply_dst;
+ struct kdbus_queue_entry *queue_entry;
+ u64 deadline_ns;
+ u64 cookie;
+ u64 name_id;
+ bool sync:1;
+ bool waiting:1;
+ bool interrupted:1;
+ int err;
+};
+
+struct kdbus_reply *kdbus_reply_new(struct kdbus_conn *reply_src,
+ struct kdbus_conn *reply_dst,
+ const struct kdbus_msg *msg,
+ struct kdbus_name_entry *name_entry,
+ bool sync);
+
+struct kdbus_reply *kdbus_reply_ref(struct kdbus_reply *r);
+struct kdbus_reply *kdbus_reply_unref(struct kdbus_reply *r);
+
+void kdbus_reply_link(struct kdbus_reply *r);
+void kdbus_reply_unlink(struct kdbus_reply *r);
+
+struct kdbus_reply *kdbus_reply_find(struct kdbus_conn *replying,
+ struct kdbus_conn *reply_dst,
+ u64 cookie);
+
+void kdbus_sync_reply_wakeup(struct kdbus_reply *reply, int err);
+void kdbus_reply_list_scan_work(struct work_struct *work);
+
+#endif /* __KDBUS_REPLY_H */
diff --git a/ipc/kdbus/util.c b/ipc/kdbus/util.c
new file mode 100644
index 000000000..72b188330
--- /dev/null
+++ b/ipc/kdbus/util.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/cred.h>
+#include <linux/ctype.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <linux/user_namespace.h>
+
+#include "limits.h"
+#include "util.h"
+
+/**
+ * kdbus_copy_from_user() - copy aligned data from user-space
+ * @dest: target buffer in kernel memory
+ * @user_ptr: user-provided source buffer
+ * @size: memory size to copy from user
+ *
+ * This copies @size bytes from @user_ptr into the kernel, just like
+ * copy_from_user() does. But we enforce an 8-byte alignment and reject any
+ * unaligned user-space pointers.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_copy_from_user(void *dest, void __user *user_ptr, size_t size)
+{
+ if (!KDBUS_IS_ALIGNED8((uintptr_t)user_ptr))
+ return -EFAULT;
+
+ if (copy_from_user(dest, user_ptr, size))
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * kdbus_verify_uid_prefix() - verify UID prefix of a user-supplied name
+ * @name: user-supplied name to verify
+ * @user_ns: user-namespace to act in
+ * @kuid: Kernel internal uid of user
+ *
+ * This verifies that the user-supplied name @name is prefixed with the user's
+ * UID. This is the default name-spacing policy we enforce on user-supplied
+ * names for public kdbus entities like buses and endpoints.
+ *
+ * The user must supply names prefixed with "<UID>-", where the UID is
+ * interpreted in the user-namespace of the domain. If the user fails to supply
+ * such a prefixed name, we reject it.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int kdbus_verify_uid_prefix(const char *name, struct user_namespace *user_ns,
+ kuid_t kuid)
+{
+ uid_t uid;
+ char prefix[16];
+
+ /*
+ * The kuid must have a mapping into the userns of the domain
+ * otherwise do not allow creation of buses nor endpoints.
+ */
+ uid = from_kuid(user_ns, kuid);
+ if (uid == (uid_t) -1)
+ return -EINVAL;
+
+ snprintf(prefix, sizeof(prefix), "%u-", uid);
+ if (strncmp(name, prefix, strlen(prefix)) != 0)
+ return -EINVAL;
+
+ return 0;
+}
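+
+/*
+ * Illustrative sketch, not part of the original patch: for a caller whose
+ * kuid maps to UID 1000 in the domain's user-namespace, the computed prefix
+ * is "1000-", so:
+ *
+ *	kdbus_verify_uid_prefix("1000-mybus", user_ns, kuid)    returns 0
+ *	kdbus_verify_uid_prefix("mybus", user_ns, kuid)         returns -EINVAL
+ *	kdbus_verify_uid_prefix("1001-mybus", user_ns, kuid)    returns -EINVAL
+ *
+ * The bus name "mybus" is a made-up example.
+ */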
+
+/**
+ * kdbus_sanitize_attach_flags() - Sanitize attach flags from user-space
+ * @flags: Attach flags provided by userspace
+ * @attach_flags: A pointer where to store the valid attach flags
+ *
+ * Convert attach-flags provided by user-space into a valid mask. If the mask
+ * is invalid, an error is returned. The sanitized attach flags are stored in
+ * the output parameter.
+ *
+ * Return: 0 on success, negative error on failure.
+ */
+int kdbus_sanitize_attach_flags(u64 flags, u64 *attach_flags)
+{
+ /* 'any' degrades to 'all' for compatibility */
+ if (flags == _KDBUS_ATTACH_ANY)
+ flags = _KDBUS_ATTACH_ALL;
+
+ /* reject unknown attach flags */
+ if (flags & ~_KDBUS_ATTACH_ALL)
+ return -EINVAL;
+
+ *attach_flags = flags;
+ return 0;
+}
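+
+/*
+ * Illustrative sketch, not part of the original patch: a typical caller
+ * sanitizes a user-supplied metadata mask before storing it. The variable
+ * names below are hypothetical.
+ *
+ *	u64 attach_flags;
+ *	int ret;
+ *
+ *	ret = kdbus_sanitize_attach_flags(user_supplied_flags, &attach_flags);
+ *	if (ret < 0)
+ *		return ret;	// unknown bits were set: -EINVAL
+ *	// _KDBUS_ATTACH_ANY (all bits set) is accepted and is stored
+ *	// as _KDBUS_ATTACH_ALL in attach_flags
+ */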
+
+/**
+ * kdbus_kvec_set() - helper utility to assemble kvec arrays
+ * @kvec: kvec entry to use
+ * @src: Source address to set in @kvec
+ * @len: Number of bytes in @src
+ * @total_len: Pointer to total length variable
+ *
+ * Set @src and @len in @kvec, and increase @total_len by @len.
+ */
+void kdbus_kvec_set(struct kvec *kvec, void *src, size_t len, u64 *total_len)
+{
+ kvec->iov_base = src;
+ kvec->iov_len = len;
+ *total_len += len;
+}
+
+static const char * const zeros = "\0\0\0\0\0\0\0";
+
+/**
+ * kdbus_kvec_pad() - conditionally write a padding kvec
+ * @kvec: kvec entry to use
+ * @len: Total length used for kvec array
+ *
+ * Check if the current total byte length of the array in @len is aligned to
+ * 8 bytes. If it isn't, fill @kvec with padding information and increase @len
+ * by the number of bytes stored in @kvec.
+ *
+ * Return: the number of added padding bytes.
+ */
+size_t kdbus_kvec_pad(struct kvec *kvec, u64 *len)
+{
+ size_t pad = KDBUS_ALIGN8(*len) - *len;
+
+ if (!pad)
+ return 0;
+
+ kvec->iov_base = (void *)zeros;
+ kvec->iov_len = pad;
+
+ *len += pad;
+
+ return pad;
+}
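+
+/*
+ * Illustrative sketch, not part of the original patch: how the two helpers
+ * above are meant to be combined when assembling an 8-byte aligned kvec
+ * array. The buffer names and sizes are made up for the example.
+ */
+static void __maybe_unused kdbus_kvec_example(void)
+{
+	struct kvec kvec[3];
+	char hdr[12], body[32];
+	u64 total = 0;
+	size_t cnt = 0;
+
+	kdbus_kvec_set(&kvec[cnt++], hdr, sizeof(hdr), &total);
+	/* 12 bytes is not 8-byte aligned, so a 4-byte padding kvec is added */
+	if (kdbus_kvec_pad(&kvec[cnt], &total) > 0)
+		cnt++;
+	kdbus_kvec_set(&kvec[cnt++], body, sizeof(body), &total);
+	/* total is now 48 == 12 + 4 (pad) + 32, and cnt == 3 */
+}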
diff --git a/ipc/kdbus/util.h b/ipc/kdbus/util.h
new file mode 100644
index 000000000..529716669
--- /dev/null
+++ b/ipc/kdbus/util.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_UTIL_H
+#define __KDBUS_UTIL_H
+
+#include <linux/dcache.h>
+#include <linux/ioctl.h>
+
+#include <uapi/linux/kdbus.h>
+
+/* all exported addresses are 64 bit */
+#define KDBUS_PTR(addr) ((void __user *)(uintptr_t)(addr))
+
+/* all exported sizes are 64 bit and data aligned to 64 bit */
+#define KDBUS_ALIGN8(s) ALIGN((s), 8)
+#define KDBUS_IS_ALIGNED8(s) (IS_ALIGNED(s, 8))
+
+/**
+ * kdbus_member_set_user() - write a structure member to user memory
+ * @_s: Variable to copy from
+ * @_b: Buffer to write to
+ * @_t: Structure type
+ * @_m: Member name in the passed structure
+ *
+ * Return: the result of copy_to_user()
+ */
+#define kdbus_member_set_user(_s, _b, _t, _m) \
+({ \
+ u64 __user *_sz = \
+ (void __user *)((u8 __user *)(_b) + offsetof(_t, _m)); \
+ copy_to_user(_sz, _s, FIELD_SIZEOF(_t, _m)); \
+})
+
+/**
+ * kdbus_strhash() - calculate a hash
+ * @str: String
+ *
+ * Return: hash value
+ */
+static inline unsigned int kdbus_strhash(const char *str)
+{
+ unsigned long hash = init_name_hash();
+
+ while (*str)
+ hash = partial_name_hash(*str++, hash);
+
+ return end_name_hash(hash);
+}
+
+int kdbus_verify_uid_prefix(const char *name, struct user_namespace *user_ns,
+ kuid_t kuid);
+int kdbus_sanitize_attach_flags(u64 flags, u64 *attach_flags);
+
+int kdbus_copy_from_user(void *dest, void __user *user_ptr, size_t size);
+
+struct kvec;
+
+void kdbus_kvec_set(struct kvec *kvec, void *src, size_t len, u64 *total_len);
+size_t kdbus_kvec_pad(struct kvec *kvec, u64 *len);
+
+#endif
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
new file mode 100644
index 000000000..68d4e9537
--- /dev/null
+++ b/ipc/mq_sysctl.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2007 IBM Corporation
+ *
+ * Author: Cedric Le Goater <clg@fr.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+#include <linux/sysctl.h>
+
+#ifdef CONFIG_PROC_SYSCTL
+static void *get_mq(struct ctl_table *table)
+{
+ char *which = table->data;
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+ which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
+ return which;
+}
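+
+/*
+ * Illustrative sketch, not part of the original patch: the sysctl tables
+ * below declare .data as pointers into init_ipc_ns. get_mq() rebases such a
+ * pointer into the caller's ipc namespace by reusing the member offset,
+ * conceptually:
+ *
+ *	offset = table->data - (char *)&init_ipc_ns;
+ *	return (char *)current->nsproxy->ipc_ns + offset;
+ */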
+
+static int proc_mq_dointvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table mq_table;
+ memcpy(&mq_table, table, sizeof(mq_table));
+ mq_table.data = get_mq(table);
+
+ return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
+}
+
+static int proc_mq_dointvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table mq_table;
+ memcpy(&mq_table, table, sizeof(mq_table));
+ mq_table.data = get_mq(table);
+
+ return proc_dointvec_minmax(&mq_table, write, buffer,
+ lenp, ppos);
+}
+#else
+#define proc_mq_dointvec NULL
+#define proc_mq_dointvec_minmax NULL
+#endif
+
+static int msg_max_limit_min = MIN_MSGMAX;
+static int msg_max_limit_max = HARD_MSGMAX;
+
+static int msg_maxsize_limit_min = MIN_MSGSIZEMAX;
+static int msg_maxsize_limit_max = HARD_MSGSIZEMAX;
+
+static struct ctl_table mq_sysctls[] = {
+ {
+ .procname = "queues_max",
+ .data = &init_ipc_ns.mq_queues_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_mq_dointvec,
+ },
+ {
+ .procname = "msg_max",
+ .data = &init_ipc_ns.mq_msg_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_mq_dointvec_minmax,
+ .extra1 = &msg_max_limit_min,
+ .extra2 = &msg_max_limit_max,
+ },
+ {
+ .procname = "msgsize_max",
+ .data = &init_ipc_ns.mq_msgsize_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_mq_dointvec_minmax,
+ .extra1 = &msg_maxsize_limit_min,
+ .extra2 = &msg_maxsize_limit_max,
+ },
+ {
+ .procname = "msg_default",
+ .data = &init_ipc_ns.mq_msg_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_mq_dointvec_minmax,
+ .extra1 = &msg_max_limit_min,
+ .extra2 = &msg_max_limit_max,
+ },
+ {
+ .procname = "msgsize_default",
+ .data = &init_ipc_ns.mq_msgsize_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_mq_dointvec_minmax,
+ .extra1 = &msg_maxsize_limit_min,
+ .extra2 = &msg_maxsize_limit_max,
+ },
+ {}
+};
+
+static struct ctl_table mq_sysctl_dir[] = {
+ {
+ .procname = "mqueue",
+ .mode = 0555,
+ .child = mq_sysctls,
+ },
+ {}
+};
+
+static struct ctl_table mq_sysctl_root[] = {
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = mq_sysctl_dir,
+ },
+ {}
+};
+
+struct ctl_table_header *mq_register_sysctl_table(void)
+{
+ return register_sysctl_table(mq_sysctl_root);
+}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
new file mode 100644
index 000000000..3aaea7ffd
--- /dev/null
+++ b/ipc/mqueue.c
@@ -0,0 +1,1462 @@
+/*
+ * POSIX message queues filesystem for Linux.
+ *
+ * Copyright (C) 2003,2004 Krzysztof Benedyczak (golbi@mat.uni.torun.pl)
+ * Michal Wronski (michal.wronski@gmail.com)
+ *
+ * Spinlocks: Mohamed Abbas (abbas.mohamed@intel.com)
+ * Lockless receive & send, fd based notify:
+ * Manfred Spraul (manfred@colorfullife.com)
+ *
+ * Audit: George Wilson (ltcgcw@us.ibm.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/capability.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sysctl.h>
+#include <linux/poll.h>
+#include <linux/mqueue.h>
+#include <linux/msg.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/netlink.h>
+#include <linux/syscalls.h>
+#include <linux/audit.h>
+#include <linux/signal.h>
+#include <linux/mutex.h>
+#include <linux/nsproxy.h>
+#include <linux/pid.h>
+#include <linux/ipc_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+#include "util.h"
+
+#define MQUEUE_MAGIC 0x19800202
+#define DIRENT_SIZE 20
+#define FILENT_SIZE 80
+
+#define SEND 0
+#define RECV 1
+
+#define STATE_NONE 0
+#define STATE_PENDING 1
+#define STATE_READY 2
+
+struct posix_msg_tree_node {
+ struct rb_node rb_node;
+ struct list_head msg_list;
+ int priority;
+};
+
+struct ext_wait_queue { /* queue of sleeping tasks */
+ struct task_struct *task;
+ struct list_head list;
+ struct msg_msg *msg; /* ptr of loaded message */
+ int state; /* one of STATE_* values */
+};
+
+struct mqueue_inode_info {
+ spinlock_t lock;
+ struct inode vfs_inode;
+ wait_queue_head_t wait_q;
+
+ struct rb_root msg_tree;
+ struct posix_msg_tree_node *node_cache;
+ struct mq_attr attr;
+
+ struct sigevent notify;
+ struct pid *notify_owner;
+ struct user_namespace *notify_user_ns;
+ struct user_struct *user; /* user who created, for accounting */
+ struct sock *notify_sock;
+ struct sk_buff *notify_cookie;
+
+ /* for tasks waiting for free space and messages, respectively */
+ struct ext_wait_queue e_wait_q[2];
+
+ unsigned long qsize; /* size of queue in memory (sum of all msgs) */
+};
+
+static const struct inode_operations mqueue_dir_inode_operations;
+static const struct file_operations mqueue_file_operations;
+static const struct super_operations mqueue_super_ops;
+static void remove_notification(struct mqueue_inode_info *info);
+
+static struct kmem_cache *mqueue_inode_cachep;
+
+static struct ctl_table_header *mq_sysctl_table;
+
+static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
+{
+ return container_of(inode, struct mqueue_inode_info, vfs_inode);
+}
+
+/*
+ * This routine should be called with the mq_lock held.
+ */
+static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
+{
+ return get_ipc_ns(inode->i_sb->s_fs_info);
+}
+
+static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
+{
+ struct ipc_namespace *ns;
+
+ spin_lock(&mq_lock);
+ ns = __get_ns_from_inode(inode);
+ spin_unlock(&mq_lock);
+ return ns;
+}
+
+/* Auxiliary functions to manipulate messages' list */
+static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
+{
+ struct rb_node **p, *parent = NULL;
+ struct posix_msg_tree_node *leaf;
+
+ p = &info->msg_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
+
+ if (likely(leaf->priority == msg->m_type))
+ goto insert_msg;
+ else if (msg->m_type < leaf->priority)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+ if (info->node_cache) {
+ leaf = info->node_cache;
+ info->node_cache = NULL;
+ } else {
+ leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
+ if (!leaf)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&leaf->msg_list);
+ info->qsize += sizeof(*leaf);
+ }
+ leaf->priority = msg->m_type;
+ rb_link_node(&leaf->rb_node, parent, p);
+ rb_insert_color(&leaf->rb_node, &info->msg_tree);
+insert_msg:
+ info->attr.mq_curmsgs++;
+ info->qsize += msg->m_ts;
+ list_add_tail(&msg->m_list, &leaf->msg_list);
+ return 0;
+}
+
+static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
+{
+ struct rb_node **p, *parent = NULL;
+ struct posix_msg_tree_node *leaf;
+ struct msg_msg *msg;
+
+try_again:
+ p = &info->msg_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ /*
+ * During insert, low priorities go to the left and high to the
+ * right. On receive, we want the highest priorities first, so
+ * walk all the way to the right.
+ */
+ p = &(*p)->rb_right;
+ }
+ if (!parent) {
+ if (info->attr.mq_curmsgs) {
+ pr_warn_once("Inconsistency in POSIX message queue, "
+ "no tree element, but supposedly messages "
+ "should exist!\n");
+ info->attr.mq_curmsgs = 0;
+ }
+ return NULL;
+ }
+ leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
+ if (unlikely(list_empty(&leaf->msg_list))) {
+ pr_warn_once("Inconsistency in POSIX message queue, "
+ "empty leaf node but we haven't implemented "
+ "lazy leaf delete!\n");
+ rb_erase(&leaf->rb_node, &info->msg_tree);
+ if (info->node_cache) {
+ info->qsize -= sizeof(*leaf);
+ kfree(leaf);
+ } else {
+ info->node_cache = leaf;
+ }
+ goto try_again;
+ } else {
+ msg = list_first_entry(&leaf->msg_list,
+ struct msg_msg, m_list);
+ list_del(&msg->m_list);
+ if (list_empty(&leaf->msg_list)) {
+ rb_erase(&leaf->rb_node, &info->msg_tree);
+ if (info->node_cache) {
+ info->qsize -= sizeof(*leaf);
+ kfree(leaf);
+ } else {
+ info->node_cache = leaf;
+ }
+ }
+ }
+ info->attr.mq_curmsgs--;
+ info->qsize -= msg->m_ts;
+ return msg;
+}
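+
+/*
+ * Illustrative sketch, not part of the original patch: with three messages
+ * sent at priorities 5, 1 and 5 (in that order), the tree holds one leaf
+ * per priority, each with a FIFO list:
+ *
+ *	prio 1: [msg2]
+ *	prio 5: [msg1, msg3]
+ *
+ * msg_get() walks to the rightmost leaf, so receives return msg1, msg3 and
+ * then msg2: highest priority first, FIFO order within a priority.
+ */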
+
+static struct inode *mqueue_get_inode(struct super_block *sb,
+ struct ipc_namespace *ipc_ns, umode_t mode,
+ struct mq_attr *attr)
+{
+ struct user_struct *u = current_user();
+ struct inode *inode;
+ int ret = -ENOMEM;
+
+ inode = new_inode(sb);
+ if (!inode)
+ goto err;
+
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+ inode->i_mtime = inode->i_ctime = inode->i_atime = CURRENT_TIME;
+
+ if (S_ISREG(mode)) {
+ struct mqueue_inode_info *info;
+ unsigned long mq_bytes, mq_treesize;
+
+ inode->i_fop = &mqueue_file_operations;
+ inode->i_size = FILENT_SIZE;
+ /* mqueue specific info */
+ info = MQUEUE_I(inode);
+ spin_lock_init(&info->lock);
+ init_waitqueue_head(&info->wait_q);
+ INIT_LIST_HEAD(&info->e_wait_q[0].list);
+ INIT_LIST_HEAD(&info->e_wait_q[1].list);
+ info->notify_owner = NULL;
+ info->notify_user_ns = NULL;
+ info->qsize = 0;
+ info->user = NULL; /* set when all is ok */
+ info->msg_tree = RB_ROOT;
+ info->node_cache = NULL;
+ memset(&info->attr, 0, sizeof(info->attr));
+ info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
+ ipc_ns->mq_msg_default);
+ info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
+ ipc_ns->mq_msgsize_default);
+ if (attr) {
+ info->attr.mq_maxmsg = attr->mq_maxmsg;
+ info->attr.mq_msgsize = attr->mq_msgsize;
+ }
+ /*
+ * We used to allocate a static array of pointers and account
+ * the size of that array as well as one msg_msg struct per
+ * possible message into the queue size. That's no longer
+ * accurate as the queue is now an rbtree and will grow and
+ * shrink depending on usage patterns. We can, however, still
+ * account one msg_msg struct per message, but the nodes are
+ * allocated depending on priority usage, and most programs
+ * only use one, or a handful, of priorities. However, since
+ * this is pinned memory, we need to assume worst case, so
+ * that means the min(mq_maxmsg, max_priorities) * struct
+ * posix_msg_tree_node.
+ */
+ mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
+ min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
+ sizeof(struct posix_msg_tree_node);
+
+ mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
+ info->attr.mq_msgsize);
+
+ spin_lock(&mq_lock);
+ if (u->mq_bytes + mq_bytes < u->mq_bytes ||
+ u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) {
+ spin_unlock(&mq_lock);
+ /* mqueue_evict_inode() releases info->messages */
+ ret = -EMFILE;
+ goto out_inode;
+ }
+ u->mq_bytes += mq_bytes;
+ spin_unlock(&mq_lock);
+
+ /* all is ok */
+ info->user = get_uid(u);
+ } else if (S_ISDIR(mode)) {
+ inc_nlink(inode);
+ /* Some things misbehave if size == 0 on a directory */
+ inode->i_size = 2 * DIRENT_SIZE;
+ inode->i_op = &mqueue_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ }
+
+ return inode;
+out_inode:
+ iput(inode);
+err:
+ return ERR_PTR(ret);
+}
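+
+/*
+ * Illustrative sketch, not part of the original patch: for a queue created
+ * with mq_maxmsg = 10 and mq_msgsize = 8192, the worst-case pinned memory
+ * charged against RLIMIT_MSGQUEUE above is
+ *
+ *	mq_treesize = 10 * sizeof(struct msg_msg)
+ *		    + min(10, MQ_PRIO_MAX) * sizeof(struct posix_msg_tree_node)
+ *	mq_bytes    = mq_treesize + 10 * 8192
+ *
+ * i.e. 80 KiB of payload plus per-message and per-priority-node overhead.
+ */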
+
+static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct inode *inode;
+ struct ipc_namespace *ns = data;
+
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = MQUEUE_MAGIC;
+ sb->s_op = &mqueue_super_ops;
+
+ inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root)
+ return -ENOMEM;
+ return 0;
+}
+
+static struct dentry *mqueue_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
+{
+ if (!(flags & MS_KERNMOUNT)) {
+ struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+ /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+ * over the ipc namespace.
+ */
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ data = ns;
+ }
+ return mount_ns(fs_type, flags, data, mqueue_fill_super);
+}
+
+static void init_once(void *foo)
+{
+ struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
+
+ inode_init_once(&p->vfs_inode);
+}
+
+static struct inode *mqueue_alloc_inode(struct super_block *sb)
+{
+ struct mqueue_inode_info *ei;
+
+ ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL);
+ if (!ei)
+ return NULL;
+ return &ei->vfs_inode;
+}
+
+static void mqueue_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
+}
+
+static void mqueue_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, mqueue_i_callback);
+}
+
+static void mqueue_evict_inode(struct inode *inode)
+{
+ struct mqueue_inode_info *info;
+ struct user_struct *user;
+ unsigned long mq_bytes, mq_treesize;
+ struct ipc_namespace *ipc_ns;
+ struct msg_msg *msg;
+
+ clear_inode(inode);
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ ipc_ns = get_ns_from_inode(inode);
+ info = MQUEUE_I(inode);
+ spin_lock(&info->lock);
+ while ((msg = msg_get(info)) != NULL)
+ free_msg(msg);
+ kfree(info->node_cache);
+ spin_unlock(&info->lock);
+
+ /* Total amount of bytes accounted for the mqueue */
+ mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
+ min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
+ sizeof(struct posix_msg_tree_node);
+
+ mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
+ info->attr.mq_msgsize);
+
+ user = info->user;
+ if (user) {
+ spin_lock(&mq_lock);
+ user->mq_bytes -= mq_bytes;
+ /*
+ * get_ns_from_inode() ensures that the
+ * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
+ * to which we now hold a reference, or it is NULL.
+ * We can't put it here under mq_lock, though.
+ */
+ if (ipc_ns)
+ ipc_ns->mq_queues_count--;
+ spin_unlock(&mq_lock);
+ free_uid(user);
+ }
+ if (ipc_ns)
+ put_ipc_ns(ipc_ns);
+}
+
+static int mqueue_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
+{
+ struct inode *inode;
+ struct mq_attr *attr = dentry->d_fsdata;
+ int error;
+ struct ipc_namespace *ipc_ns;
+
+ spin_lock(&mq_lock);
+ ipc_ns = __get_ns_from_inode(dir);
+ if (!ipc_ns) {
+ error = -EACCES;
+ goto out_unlock;
+ }
+
+ if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
+ !capable(CAP_SYS_RESOURCE)) {
+ error = -ENOSPC;
+ goto out_unlock;
+ }
+ ipc_ns->mq_queues_count++;
+ spin_unlock(&mq_lock);
+
+ inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
+ spin_lock(&mq_lock);
+ ipc_ns->mq_queues_count--;
+ goto out_unlock;
+ }
+
+ put_ipc_ns(ipc_ns);
+ dir->i_size += DIRENT_SIZE;
+ dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
+
+ d_instantiate(dentry, inode);
+ dget(dentry);
+ return 0;
+out_unlock:
+ spin_unlock(&mq_lock);
+ if (ipc_ns)
+ put_ipc_ns(ipc_ns);
+ return error;
+}
+
+static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
+ dir->i_size -= DIRENT_SIZE;
+ drop_nlink(inode);
+ dput(dentry);
+ return 0;
+}
+
+/*
+ * This routine handles a read(2) on the queue file. To avoid turning this
+ * into some sort of mq_receive, we only expose the queue size and the
+ * notification info: the only values that are interesting from the user's
+ * point of view and that aren't accessible through the standard routines.
+ */
+static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
+ size_t count, loff_t *off)
+{
+ struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
+ char buffer[FILENT_SIZE];
+ ssize_t ret;
+
+ spin_lock(&info->lock);
+ snprintf(buffer, sizeof(buffer),
+ "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
+ info->qsize,
+ info->notify_owner ? info->notify.sigev_notify : 0,
+ (info->notify_owner &&
+ info->notify.sigev_notify == SIGEV_SIGNAL) ?
+ info->notify.sigev_signo : 0,
+ pid_vnr(info->notify_owner));
+ spin_unlock(&info->lock);
+ buffer[sizeof(buffer)-1] = '\0';
+
+ ret = simple_read_from_buffer(u_data, count, off, buffer,
+ strlen(buffer));
+ if (ret <= 0)
+ return ret;
+
+ file_inode(filp)->i_atime = file_inode(filp)->i_ctime = CURRENT_TIME;
+ return ret;
+}
+
+static int mqueue_flush_file(struct file *filp, fl_owner_t id)
+{
+ struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
+
+ spin_lock(&info->lock);
+ if (task_tgid(current) == info->notify_owner)
+ remove_notification(info);
+
+ spin_unlock(&info->lock);
+ return 0;
+}
+
+static unsigned int mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)
+{
+ struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
+ int retval = 0;
+
+ poll_wait(filp, &info->wait_q, poll_tab);
+
+ spin_lock(&info->lock);
+ if (info->attr.mq_curmsgs)
+ retval = POLLIN | POLLRDNORM;
+
+ if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
+ retval |= POLLOUT | POLLWRNORM;
+ spin_unlock(&info->lock);
+
+ return retval;
+}
+
+/* Adds current to info->e_wait_q[sr] before element with smaller prio */
+static void wq_add(struct mqueue_inode_info *info, int sr,
+ struct ext_wait_queue *ewp)
+{
+ struct ext_wait_queue *walk;
+
+ ewp->task = current;
+
+ list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
+ if (walk->task->static_prio <= current->static_prio) {
+ list_add_tail(&ewp->list, &walk->list);
+ return;
+ }
+ }
+ list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
+}
+
+/*
+ * Puts current task to sleep. Caller must hold queue lock. After return
+ * lock isn't held.
+ * sr: SEND or RECV
+ */
+static int wq_sleep(struct mqueue_inode_info *info, int sr,
+ ktime_t *timeout, struct ext_wait_queue *ewp)
+{
+ int retval;
+ signed long time;
+
+ wq_add(info, sr, ewp);
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ spin_unlock(&info->lock);
+ time = schedule_hrtimeout_range_clock(timeout, 0,
+ HRTIMER_MODE_ABS, CLOCK_REALTIME);
+
+ while (ewp->state == STATE_PENDING)
+ cpu_relax();
+
+ if (ewp->state == STATE_READY) {
+ retval = 0;
+ goto out;
+ }
+ spin_lock(&info->lock);
+ if (ewp->state == STATE_READY) {
+ retval = 0;
+ goto out_unlock;
+ }
+ if (signal_pending(current)) {
+ retval = -ERESTARTSYS;
+ break;
+ }
+ if (time == 0) {
+ retval = -ETIMEDOUT;
+ break;
+ }
+ }
+ list_del(&ewp->list);
+out_unlock:
+ spin_unlock(&info->lock);
+out:
+ return retval;
+}
+
+/*
+ * Returns waiting task that should be serviced first or NULL if none exists
+ */
+static struct ext_wait_queue *wq_get_first_waiter(
+ struct mqueue_inode_info *info, int sr)
+{
+ struct list_head *ptr;
+
+ ptr = info->e_wait_q[sr].list.prev;
+ if (ptr == &info->e_wait_q[sr].list)
+ return NULL;
+ return list_entry(ptr, struct ext_wait_queue, list);
+}
+
+
+static inline void set_cookie(struct sk_buff *skb, char code)
+{
+ ((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
+}
+
+/*
+ * The next function exists only to keep sys_mq_timedsend from growing
+ * too long.
+ */
+static void __do_notify(struct mqueue_inode_info *info)
+{
+	/*
+	 * A notification is sent when a process has registered for it, no
+	 * process is waiting synchronously for a message, and the queue went
+	 * from empty to non-empty. At this point we know nobody is waiting
+	 * synchronously.
+	 */
+ if (info->notify_owner &&
+ info->attr.mq_curmsgs == 1) {
+ struct siginfo sig_i;
+ switch (info->notify.sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ /* sends signal */
+
+ sig_i.si_signo = info->notify.sigev_signo;
+ sig_i.si_errno = 0;
+ sig_i.si_code = SI_MESGQ;
+ sig_i.si_value = info->notify.sigev_value;
+ /* map current pid/uid into info->owner's namespaces */
+ rcu_read_lock();
+ sig_i.si_pid = task_tgid_nr_ns(current,
+ ns_of_pid(info->notify_owner));
+ sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid());
+ rcu_read_unlock();
+
+ kill_pid_info(info->notify.sigev_signo,
+ &sig_i, info->notify_owner);
+ break;
+ case SIGEV_THREAD:
+ set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
+ netlink_sendskb(info->notify_sock, info->notify_cookie);
+ break;
+ }
+ /* after notification unregisters process */
+ put_pid(info->notify_owner);
+ put_user_ns(info->notify_user_ns);
+ info->notify_owner = NULL;
+ info->notify_user_ns = NULL;
+ }
+ wake_up(&info->wait_q);
+}
+
+static int prepare_timeout(const struct timespec __user *u_abs_timeout,
+ ktime_t *expires, struct timespec *ts)
+{
+ if (copy_from_user(ts, u_abs_timeout, sizeof(struct timespec)))
+ return -EFAULT;
+ if (!timespec_valid(ts))
+ return -EINVAL;
+
+ *expires = timespec_to_ktime(*ts);
+ return 0;
+}
+
+static void remove_notification(struct mqueue_inode_info *info)
+{
+ if (info->notify_owner != NULL &&
+ info->notify.sigev_notify == SIGEV_THREAD) {
+ set_cookie(info->notify_cookie, NOTIFY_REMOVED);
+ netlink_sendskb(info->notify_sock, info->notify_cookie);
+ }
+ put_pid(info->notify_owner);
+ put_user_ns(info->notify_user_ns);
+ info->notify_owner = NULL;
+ info->notify_user_ns = NULL;
+}
+
+static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)
+{
+ int mq_treesize;
+ unsigned long total_size;
+
+ if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0)
+ return -EINVAL;
+ if (capable(CAP_SYS_RESOURCE)) {
+ if (attr->mq_maxmsg > HARD_MSGMAX ||
+ attr->mq_msgsize > HARD_MSGSIZEMAX)
+ return -EINVAL;
+ } else {
+ if (attr->mq_maxmsg > ipc_ns->mq_msg_max ||
+ attr->mq_msgsize > ipc_ns->mq_msgsize_max)
+ return -EINVAL;
+ }
+ /* check for overflow */
+ if (attr->mq_msgsize > ULONG_MAX/attr->mq_maxmsg)
+ return -EOVERFLOW;
+ mq_treesize = attr->mq_maxmsg * sizeof(struct msg_msg) +
+ min_t(unsigned int, attr->mq_maxmsg, MQ_PRIO_MAX) *
+ sizeof(struct posix_msg_tree_node);
+ total_size = attr->mq_maxmsg * attr->mq_msgsize;
+ if (total_size + mq_treesize < total_size)
+ return -EOVERFLOW;
+ return 0;
+}
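+
+/*
+ * Illustrative sketch, not part of the original patch: the division test
+ * above guards the later mq_maxmsg * mq_msgsize multiplication. On a 32-bit
+ * machine, for example, a privileged caller requesting tens of thousands of
+ * maximum-sized messages would wrap that product around ULONG_MAX; checking
+ * mq_msgsize > ULONG_MAX / mq_maxmsg catches this and returns -EOVERFLOW
+ * before any queue memory is accounted.
+ */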
+
+/*
+ * Invoked when creating a new queue via sys_mq_open
+ */
+static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir,
+ struct path *path, int oflag, umode_t mode,
+ struct mq_attr *attr)
+{
+ const struct cred *cred = current_cred();
+ int ret;
+
+ if (attr) {
+ ret = mq_attr_ok(ipc_ns, attr);
+ if (ret)
+ return ERR_PTR(ret);
+ /* store for use during create */
+ path->dentry->d_fsdata = attr;
+ } else {
+ struct mq_attr def_attr;
+
+ def_attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
+ ipc_ns->mq_msg_default);
+ def_attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
+ ipc_ns->mq_msgsize_default);
+ ret = mq_attr_ok(ipc_ns, &def_attr);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ mode &= ~current_umask();
+ ret = vfs_create(dir, path->dentry, mode, true);
+ path->dentry->d_fsdata = NULL;
+ if (ret)
+ return ERR_PTR(ret);
+ return dentry_open(path, oflag, cred);
+}
+
+/* Opens existing queue */
+static struct file *do_open(struct path *path, int oflag)
+{
+ static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
+ MAY_READ | MAY_WRITE };
+ int acc;
+ if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
+ return ERR_PTR(-EINVAL);
+ acc = oflag2acc[oflag & O_ACCMODE];
+ if (inode_permission(d_inode(path->dentry), acc))
+ return ERR_PTR(-EACCES);
+ return dentry_open(path, oflag, current_cred());
+}
+
+SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
+ struct mq_attr __user *, u_attr)
+{
+ struct path path;
+ struct file *filp;
+ struct filename *name;
+ struct mq_attr attr;
+ int fd, error;
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+ struct vfsmount *mnt = ipc_ns->mq_mnt;
+ struct dentry *root = mnt->mnt_root;
+ int ro;
+
+ if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
+ return -EFAULT;
+
+ audit_mq_open(oflag, mode, u_attr ? &attr : NULL);
+
+ if (IS_ERR(name = getname(u_name)))
+ return PTR_ERR(name);
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0)
+ goto out_putname;
+
+ ro = mnt_want_write(mnt); /* we'll drop it in any case */
+ error = 0;
+ mutex_lock(&d_inode(root)->i_mutex);
+ path.dentry = lookup_one_len(name->name, root, strlen(name->name));
+ if (IS_ERR(path.dentry)) {
+ error = PTR_ERR(path.dentry);
+ goto out_putfd;
+ }
+ path.mnt = mntget(mnt);
+
+ if (oflag & O_CREAT) {
+ if (d_really_is_positive(path.dentry)) { /* entry already exists */
+ audit_inode(name, path.dentry, 0);
+ if (oflag & O_EXCL) {
+ error = -EEXIST;
+ goto out;
+ }
+ filp = do_open(&path, oflag);
+ } else {
+ if (ro) {
+ error = ro;
+ goto out;
+ }
+ audit_inode_parent_hidden(name, root);
+ filp = do_create(ipc_ns, d_inode(root),
+ &path, oflag, mode,
+ u_attr ? &attr : NULL);
+ }
+ } else {
+ if (d_really_is_negative(path.dentry)) {
+ error = -ENOENT;
+ goto out;
+ }
+ audit_inode(name, path.dentry, 0);
+ filp = do_open(&path, oflag);
+ }
+
+ if (!IS_ERR(filp))
+ fd_install(fd, filp);
+ else
+ error = PTR_ERR(filp);
+out:
+ path_put(&path);
+out_putfd:
+ if (error) {
+ put_unused_fd(fd);
+ fd = error;
+ }
+ mutex_unlock(&d_inode(root)->i_mutex);
+ if (!ro)
+ mnt_drop_write(mnt);
+out_putname:
+ putname(name);
+ return fd;
+}
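+
+/*
+ * Illustrative user-space sketch, not part of the original patch: this
+ * syscall backs the POSIX mq_open(3) wrapper. A hypothetical caller creating
+ * a queue with explicit attributes:
+ *
+ *	#include <mqueue.h>
+ *	#include <fcntl.h>
+ *
+ *	struct mq_attr attr = {
+ *		.mq_maxmsg  = 8,
+ *		.mq_msgsize = 256,
+ *	};
+ *	mqd_t q = mq_open("/example-queue", O_CREAT | O_RDWR, 0600, &attr);
+ *	if (q == (mqd_t)-1)
+ *		perror("mq_open");	// e.g. ENOSPC once queues_max is hit
+ */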
+
+SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
+{
+ int err;
+ struct filename *name;
+ struct dentry *dentry;
+ struct inode *inode = NULL;
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+ struct vfsmount *mnt = ipc_ns->mq_mnt;
+
+ name = getname(u_name);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ audit_inode_parent_hidden(name, mnt->mnt_root);
+ err = mnt_want_write(mnt);
+ if (err)
+ goto out_name;
+ mutex_lock_nested(&d_inode(mnt->mnt_root)->i_mutex, I_MUTEX_PARENT);
+ dentry = lookup_one_len(name->name, mnt->mnt_root,
+ strlen(name->name));
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_unlock;
+ }
+
+ inode = d_inode(dentry);
+ if (!inode) {
+ err = -ENOENT;
+ } else {
+ ihold(inode);
+ err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
+ }
+ dput(dentry);
+
+out_unlock:
+ mutex_unlock(&d_inode(mnt->mnt_root)->i_mutex);
+ if (inode)
+ iput(inode);
+ mnt_drop_write(mnt);
+out_name:
+ putname(name);
+
+ return err;
+}
+
+/* Pipelined send and receive functions.
+ *
+ * If a receiver finds no waiting message, then it registers itself in the
+ * list of waiting receivers. A sender checks that list before adding the new
+ * message into the message array. If there is a waiting receiver, then it
+ * bypasses the message array and directly hands the message over to the
+ * receiver.
+ * The receiver accepts the message and returns without grabbing the queue
+ * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
+ * are necessary. The same algorithm is used for sysv semaphores, see
+ * ipc/sem.c for more details.
+ *
+ * The same algorithm is used for senders.
+ */
+
+/* pipelined_send() - send a message directly to the task waiting in
+ * sys_mq_timedreceive() (without inserting message into a queue).
+ */
+static inline void pipelined_send(struct mqueue_inode_info *info,
+ struct msg_msg *message,
+ struct ext_wait_queue *receiver)
+{
+ receiver->msg = message;
+ list_del(&receiver->list);
+ receiver->state = STATE_PENDING;
+ wake_up_process(receiver->task);
+ smp_wmb();
+ receiver->state = STATE_READY;
+}
+
+/* pipelined_receive() - if a task is waiting in sys_mq_timedsend(), take its
+ * message and insert it into the queue (a free slot is guaranteed). */
+static inline void pipelined_receive(struct mqueue_inode_info *info)
+{
+ struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
+
+ if (!sender) {
+ /* for poll */
+ wake_up_interruptible(&info->wait_q);
+ return;
+ }
+ if (msg_insert(sender->msg, info))
+ return;
+ list_del(&sender->list);
+ sender->state = STATE_PENDING;
+ wake_up_process(sender->task);
+ smp_wmb();
+ sender->state = STATE_READY;
+}
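+
+/*
+ * Illustrative sketch, not part of the original patch: the handshake as seen
+ * by a receiver blocked in wq_sleep() is
+ *
+ *	sender (pipelined_send)            receiver (wq_sleep)
+ *	-----------------------            -------------------
+ *	receiver->msg = message;
+ *	receiver->state = STATE_PENDING;
+ *	wake_up_process(task);             while (state == STATE_PENDING)
+ *	smp_wmb();                                 cpu_relax();
+ *	receiver->state = STATE_READY;     sees STATE_READY and returns 0
+ *	                                   without re-taking info->lock
+ */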
+
+SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
+ size_t, msg_len, unsigned int, msg_prio,
+ const struct timespec __user *, u_abs_timeout)
+{
+ struct fd f;
+ struct inode *inode;
+ struct ext_wait_queue wait;
+ struct ext_wait_queue *receiver;
+ struct msg_msg *msg_ptr;
+ struct mqueue_inode_info *info;
+ ktime_t expires, *timeout = NULL;
+ struct timespec ts;
+ struct posix_msg_tree_node *new_leaf = NULL;
+ int ret = 0;
+
+ if (u_abs_timeout) {
+ int res = prepare_timeout(u_abs_timeout, &expires, &ts);
+ if (res)
+ return res;
+ timeout = &expires;
+ }
+
+ if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
+ return -EINVAL;
+
+ audit_mq_sendrecv(mqdes, msg_len, msg_prio, timeout ? &ts : NULL);
+
+ f = fdget(mqdes);
+ if (unlikely(!f.file)) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ inode = file_inode(f.file);
+ if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+ info = MQUEUE_I(inode);
+ audit_file(f.file);
+
+ if (unlikely(!(f.file->f_mode & FMODE_WRITE))) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+
+ if (unlikely(msg_len > info->attr.mq_msgsize)) {
+ ret = -EMSGSIZE;
+ goto out_fput;
+ }
+
+ /* First try to allocate memory, before doing anything with
+ * existing queues. */
+ msg_ptr = load_msg(u_msg_ptr, msg_len);
+ if (IS_ERR(msg_ptr)) {
+ ret = PTR_ERR(msg_ptr);
+ goto out_fput;
+ }
+ msg_ptr->m_ts = msg_len;
+ msg_ptr->m_type = msg_prio;
+
+ /*
+ * msg_insert really wants us to have a valid, spare node struct so
+ * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
+ * fall back to that if necessary.
+ */
+ if (!info->node_cache)
+ new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
+
+ spin_lock(&info->lock);
+
+ if (!info->node_cache && new_leaf) {
+ /* Save our speculative allocation into the cache */
+ INIT_LIST_HEAD(&new_leaf->msg_list);
+ info->node_cache = new_leaf;
+ info->qsize += sizeof(*new_leaf);
+ new_leaf = NULL;
+ } else {
+ kfree(new_leaf);
+ }
+
+ if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
+ if (f.file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ } else {
+ wait.task = current;
+ wait.msg = (void *) msg_ptr;
+ wait.state = STATE_NONE;
+ ret = wq_sleep(info, SEND, timeout, &wait);
+ /*
+ * wq_sleep must be called with info->lock held, and
+ * returns with the lock released
+ */
+ goto out_free;
+ }
+ } else {
+ receiver = wq_get_first_waiter(info, RECV);
+ if (receiver) {
+ pipelined_send(info, msg_ptr, receiver);
+ } else {
+ /* adds message to the queue */
+ ret = msg_insert(msg_ptr, info);
+ if (ret)
+ goto out_unlock;
+ __do_notify(info);
+ }
+ inode->i_atime = inode->i_mtime = inode->i_ctime =
+ CURRENT_TIME;
+ }
+out_unlock:
+ spin_unlock(&info->lock);
+out_free:
+ if (ret)
+ free_msg(msg_ptr);
+out_fput:
+ fdput(f);
+out:
+ return ret;
+}
+
+SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
+ size_t, msg_len, unsigned int __user *, u_msg_prio,
+ const struct timespec __user *, u_abs_timeout)
+{
+ ssize_t ret;
+ struct msg_msg *msg_ptr;
+ struct fd f;
+ struct inode *inode;
+ struct mqueue_inode_info *info;
+ struct ext_wait_queue wait;
+ ktime_t expires, *timeout = NULL;
+ struct timespec ts;
+ struct posix_msg_tree_node *new_leaf = NULL;
+
+ if (u_abs_timeout) {
+ int res = prepare_timeout(u_abs_timeout, &expires, &ts);
+ if (res)
+ return res;
+ timeout = &expires;
+ }
+
+ audit_mq_sendrecv(mqdes, msg_len, 0, timeout ? &ts : NULL);
+
+ f = fdget(mqdes);
+ if (unlikely(!f.file)) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ inode = file_inode(f.file);
+ if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+ info = MQUEUE_I(inode);
+ audit_file(f.file);
+
+ if (unlikely(!(f.file->f_mode & FMODE_READ))) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+
+ /* checks if buffer is big enough */
+ if (unlikely(msg_len < info->attr.mq_msgsize)) {
+ ret = -EMSGSIZE;
+ goto out_fput;
+ }
+
+ /*
+ * msg_insert really wants us to have a valid, spare node struct so
+ * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
+ * fall back to that if necessary.
+ */
+ if (!info->node_cache)
+ new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
+
+ spin_lock(&info->lock);
+
+ if (!info->node_cache && new_leaf) {
+ /* Save our speculative allocation into the cache */
+ INIT_LIST_HEAD(&new_leaf->msg_list);
+ info->node_cache = new_leaf;
+ info->qsize += sizeof(*new_leaf);
+ } else {
+ kfree(new_leaf);
+ }
+
+ if (info->attr.mq_curmsgs == 0) {
+ if (f.file->f_flags & O_NONBLOCK) {
+ spin_unlock(&info->lock);
+ ret = -EAGAIN;
+ } else {
+ wait.task = current;
+ wait.state = STATE_NONE;
+ ret = wq_sleep(info, RECV, timeout, &wait);
+ msg_ptr = wait.msg;
+ }
+ } else {
+ msg_ptr = msg_get(info);
+
+ inode->i_atime = inode->i_mtime = inode->i_ctime =
+ CURRENT_TIME;
+
+ /* There is now free space in queue. */
+ pipelined_receive(info);
+ spin_unlock(&info->lock);
+ ret = 0;
+ }
+ if (ret == 0) {
+ ret = msg_ptr->m_ts;
+
+ if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
+ store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
+ ret = -EFAULT;
+ }
+ free_msg(msg_ptr);
+ }
+out_fput:
+ fdput(f);
+out:
+ return ret;
+}
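+
+/*
+ * Illustrative user-space sketch, not part of the original patch: the two
+ * syscalls above back mq_send(3) and mq_receive(3). A hypothetical round
+ * trip on a queue descriptor q obtained from mq_open():
+ *
+ *	char buf[256];		// must be >= the queue's mq_msgsize
+ *	unsigned int prio;
+ *	ssize_t n;
+ *
+ *	mq_send(q, "hello", 5, 3);			// priority 3
+ *	n = mq_receive(q, buf, sizeof(buf), &prio);	// n == 5, prio == 3
+ *
+ * A too-small receive buffer fails with EMSGSIZE, and an empty queue blocks
+ * unless O_NONBLOCK was set, in which case it fails with EAGAIN.
+ */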
+
+/*
+ * Note: if the caller asks us to deregister (by passing a NULL pointer) but
+ * is not the current owner of the notification, the request is silently
+ * ignored. POSIX does not explicitly define this case.
+ */
+SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
+ const struct sigevent __user *, u_notification)
+{
+ int ret;
+ struct fd f;
+ struct sock *sock;
+ struct inode *inode;
+ struct sigevent notification;
+ struct mqueue_inode_info *info;
+ struct sk_buff *nc;
+
+ if (u_notification) {
+ if (copy_from_user(&notification, u_notification,
+ sizeof(struct sigevent)))
+ return -EFAULT;
+ }
+
+ audit_mq_notify(mqdes, u_notification ? &notification : NULL);
+
+ nc = NULL;
+ sock = NULL;
+ if (u_notification != NULL) {
+ if (unlikely(notification.sigev_notify != SIGEV_NONE &&
+ notification.sigev_notify != SIGEV_SIGNAL &&
+ notification.sigev_notify != SIGEV_THREAD))
+ return -EINVAL;
+ if (notification.sigev_notify == SIGEV_SIGNAL &&
+ !valid_signal(notification.sigev_signo)) {
+ return -EINVAL;
+ }
+ if (notification.sigev_notify == SIGEV_THREAD) {
+ long timeo;
+
+ /* create the notify skb */
+ nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
+ if (!nc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (copy_from_user(nc->data,
+ notification.sigev_value.sival_ptr,
+ NOTIFY_COOKIE_LEN)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* TODO: add a header? */
+ skb_put(nc, NOTIFY_COOKIE_LEN);
+ /* and attach it to the socket */
+retry:
+ f = fdget(notification.sigev_signo);
+ if (!f.file) {
+ ret = -EBADF;
+ goto out;
+ }
+ sock = netlink_getsockbyfilp(f.file);
+ fdput(f);
+ if (IS_ERR(sock)) {
+ ret = PTR_ERR(sock);
+ sock = NULL;
+ goto out;
+ }
+
+ timeo = MAX_SCHEDULE_TIMEOUT;
+ ret = netlink_attachskb(sock, nc, &timeo, NULL);
+ if (ret == 1)
+ goto retry;
+ if (ret) {
+ sock = NULL;
+ nc = NULL;
+ goto out;
+ }
+ }
+ }
+
+ f = fdget(mqdes);
+ if (!f.file) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ inode = file_inode(f.file);
+ if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+ info = MQUEUE_I(inode);
+
+ ret = 0;
+ spin_lock(&info->lock);
+ if (u_notification == NULL) {
+ if (info->notify_owner == task_tgid(current)) {
+ remove_notification(info);
+ inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ }
+ } else if (info->notify_owner != NULL) {
+ ret = -EBUSY;
+ } else {
+ switch (notification.sigev_notify) {
+ case SIGEV_NONE:
+ info->notify.sigev_notify = SIGEV_NONE;
+ break;
+ case SIGEV_THREAD:
+ info->notify_sock = sock;
+ info->notify_cookie = nc;
+ sock = NULL;
+ nc = NULL;
+ info->notify.sigev_notify = SIGEV_THREAD;
+ break;
+ case SIGEV_SIGNAL:
+ info->notify.sigev_signo = notification.sigev_signo;
+ info->notify.sigev_value = notification.sigev_value;
+ info->notify.sigev_notify = SIGEV_SIGNAL;
+ break;
+ }
+
+ info->notify_owner = get_pid(task_tgid(current));
+ info->notify_user_ns = get_user_ns(current_user_ns());
+ inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ }
+ spin_unlock(&info->lock);
+out_fput:
+ fdput(f);
+out:
+ if (sock)
+ netlink_detachskb(sock, nc);
+ else if (nc)
+ dev_kfree_skb(nc);
+
+ return ret;
+}
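+
+/*
+ * Illustrative user-space sketch, not part of the original patch:
+ * registering for SIGEV_SIGNAL notification. The signal is delivered once,
+ * when the queue goes from empty to non-empty while nobody is blocked in a
+ * receive, and the registration is then dropped and must be re-armed.
+ *
+ *	#include <mqueue.h>
+ *	#include <signal.h>
+ *
+ *	struct sigevent sev = {
+ *		.sigev_notify = SIGEV_SIGNAL,
+ *		.sigev_signo  = SIGUSR1,
+ *	};
+ *	if (mq_notify(q, &sev) == -1)
+ *		perror("mq_notify");	// EBUSY if another process registered
+ */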
+
+SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
+ const struct mq_attr __user *, u_mqstat,
+ struct mq_attr __user *, u_omqstat)
+{
+ int ret;
+ struct mq_attr mqstat, omqstat;
+ struct fd f;
+ struct inode *inode;
+ struct mqueue_inode_info *info;
+
+ if (u_mqstat != NULL) {
+ if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr)))
+ return -EFAULT;
+ if (mqstat.mq_flags & (~O_NONBLOCK))
+ return -EINVAL;
+ }
+
+ f = fdget(mqdes);
+ if (!f.file) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ inode = file_inode(f.file);
+ if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+ ret = -EBADF;
+ goto out_fput;
+ }
+ info = MQUEUE_I(inode);
+
+ spin_lock(&info->lock);
+
+ omqstat = info->attr;
+ omqstat.mq_flags = f.file->f_flags & O_NONBLOCK;
+ if (u_mqstat) {
+ audit_mq_getsetattr(mqdes, &mqstat);
+ spin_lock(&f.file->f_lock);
+ if (mqstat.mq_flags & O_NONBLOCK)
+ f.file->f_flags |= O_NONBLOCK;
+ else
+ f.file->f_flags &= ~O_NONBLOCK;
+ spin_unlock(&f.file->f_lock);
+
+ inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ }
+
+ spin_unlock(&info->lock);
+
+ ret = 0;
+ if (u_omqstat != NULL && copy_to_user(u_omqstat, &omqstat,
+ sizeof(struct mq_attr)))
+ ret = -EFAULT;
+
+out_fput:
+ fdput(f);
+out:
+ return ret;
+}
+
+static const struct inode_operations mqueue_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .create = mqueue_create,
+ .unlink = mqueue_unlink,
+};
+
+static const struct file_operations mqueue_file_operations = {
+ .flush = mqueue_flush_file,
+ .poll = mqueue_poll_file,
+ .read = mqueue_read_file,
+ .llseek = default_llseek,
+};
+
+static const struct super_operations mqueue_super_ops = {
+ .alloc_inode = mqueue_alloc_inode,
+ .destroy_inode = mqueue_destroy_inode,
+ .evict_inode = mqueue_evict_inode,
+ .statfs = simple_statfs,
+};
+
+static struct file_system_type mqueue_fs_type = {
+ .name = "mqueue",
+ .mount = mqueue_mount,
+ .kill_sb = kill_litter_super,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+
+int mq_init_ns(struct ipc_namespace *ns)
+{
+ ns->mq_queues_count = 0;
+ ns->mq_queues_max = DFLT_QUEUESMAX;
+ ns->mq_msg_max = DFLT_MSGMAX;
+ ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
+ ns->mq_msg_default = DFLT_MSG;
+ ns->mq_msgsize_default = DFLT_MSGSIZE;
+
+ ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
+ if (IS_ERR(ns->mq_mnt)) {
+ int err = PTR_ERR(ns->mq_mnt);
+ ns->mq_mnt = NULL;
+ return err;
+ }
+ return 0;
+}
+
+void mq_clear_sbinfo(struct ipc_namespace *ns)
+{
+ ns->mq_mnt->mnt_sb->s_fs_info = NULL;
+}
+
+void mq_put_mnt(struct ipc_namespace *ns)
+{
+ kern_unmount(ns->mq_mnt);
+}
+
+static int __init init_mqueue_fs(void)
+{
+ int error;
+
+ mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
+ sizeof(struct mqueue_inode_info), 0,
+ SLAB_HWCACHE_ALIGN, init_once);
+ if (mqueue_inode_cachep == NULL)
+ return -ENOMEM;
+
+ /* ignore failures - they are not fatal */
+ mq_sysctl_table = mq_register_sysctl_table();
+
+ error = register_filesystem(&mqueue_fs_type);
+ if (error)
+ goto out_sysctl;
+
+ spin_lock_init(&mq_lock);
+
+ error = mq_init_ns(&init_ipc_ns);
+ if (error)
+ goto out_filesystem;
+
+ return 0;
+
+out_filesystem:
+ unregister_filesystem(&mqueue_fs_type);
+out_sysctl:
+ if (mq_sysctl_table)
+ unregister_sysctl_table(mq_sysctl_table);
+ kmem_cache_destroy(mqueue_inode_cachep);
+ return error;
+}
+
+device_initcall(init_mqueue_fs);
diff --git a/ipc/msg.c b/ipc/msg.c
new file mode 100644
index 000000000..2b6fdbb9e
--- /dev/null
+++ b/ipc/msg.c
@@ -0,0 +1,1046 @@
+/*
+ * linux/ipc/msg.c
+ * Copyright (C) 1992 Krishna Balasubramanian
+ *
+ * Removed all the remaining kerneld mess
+ * Catch the -EFAULT stuff properly
+ * Use GFP_KERNEL for messages as in 1.2
+ * Fixed up the unchecked user space derefs
+ * Copyright (C) 1998 Alan Cox & Andi Kleen
+ *
+ * /proc/sysvipc/msg support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
+ *
+ * mostly rewritten, threaded and wake-one semantics added
+ * MSGMAX limit removed, sysctl's added
+ * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
+ *
+ * support for audit of ipc object properties and permission changes
+ * Dustin Kirkland <dustin.kirkland@us.ibm.com>
+ *
+ * namespaces support
+ * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ */
+
+#include <linux/capability.h>
+#include <linux/msg.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/list.h>
+#include <linux/security.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/audit.h>
+#include <linux/seq_file.h>
+#include <linux/rwsem.h>
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+
+#include <asm/current.h>
+#include <linux/uaccess.h>
+#include "util.h"
+
+/* one msg_receiver structure for each sleeping receiver */
+struct msg_receiver {
+ struct list_head r_list;
+ struct task_struct *r_tsk;
+
+ int r_mode;
+ long r_msgtype;
+ long r_maxsize;
+
+ /*
+ * Mark r_msg volatile so that the compiler
+ * does not try to get smart and optimize
+ * it. We rely on this for the lockless
+ * receive algorithm.
+ */
+ struct msg_msg *volatile r_msg;
+};
+
+/* one msg_sender for each sleeping sender */
+struct msg_sender {
+ struct list_head list;
+ struct task_struct *tsk;
+};
+
+#define SEARCH_ANY 1
+#define SEARCH_EQUAL 2
+#define SEARCH_NOTEQUAL 3
+#define SEARCH_LESSEQUAL 4
+#define SEARCH_NUMBER 5
+
+#define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS])
+
+static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id)
+{
+ struct kern_ipc_perm *ipcp = ipc_obtain_object(&msg_ids(ns), id);
+
+ if (IS_ERR(ipcp))
+ return ERR_CAST(ipcp);
+
+ return container_of(ipcp, struct msg_queue, q_perm);
+}
+
+static inline struct msg_queue *msq_obtain_object_check(struct ipc_namespace *ns,
+ int id)
+{
+ struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&msg_ids(ns), id);
+
+ if (IS_ERR(ipcp))
+ return ERR_CAST(ipcp);
+
+ return container_of(ipcp, struct msg_queue, q_perm);
+}
+
+static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s)
+{
+ ipc_rmid(&msg_ids(ns), &s->q_perm);
+}
+
+static void msg_rcu_free(struct rcu_head *head)
+{
+ struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
+ struct msg_queue *msq = ipc_rcu_to_struct(p);
+
+ security_msg_queue_free(msq);
+ ipc_rcu_free(head);
+}
+
+/**
+ * newque - Create a new msg queue
+ * @ns: namespace
+ * @params: ptr to the structure that contains the key and msgflg
+ *
+ * Called with msg_ids.rwsem held (writer)
+ */
+static int newque(struct ipc_namespace *ns, struct ipc_params *params)
+{
+ struct msg_queue *msq;
+ int id, retval;
+ key_t key = params->key;
+ int msgflg = params->flg;
+
+ msq = ipc_rcu_alloc(sizeof(*msq));
+ if (!msq)
+ return -ENOMEM;
+
+ msq->q_perm.mode = msgflg & S_IRWXUGO;
+ msq->q_perm.key = key;
+
+ msq->q_perm.security = NULL;
+ retval = security_msg_queue_alloc(msq);
+ if (retval) {
+ ipc_rcu_putref(msq, ipc_rcu_free);
+ return retval;
+ }
+
+ /* ipc_addid() locks msq upon success. */
+ id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
+ if (id < 0) {
+ ipc_rcu_putref(msq, msg_rcu_free);
+ return id;
+ }
+
+ msq->q_stime = msq->q_rtime = 0;
+ msq->q_ctime = get_seconds();
+ msq->q_cbytes = msq->q_qnum = 0;
+ msq->q_qbytes = ns->msg_ctlmnb;
+ msq->q_lspid = msq->q_lrpid = 0;
+ INIT_LIST_HEAD(&msq->q_messages);
+ INIT_LIST_HEAD(&msq->q_receivers);
+ INIT_LIST_HEAD(&msq->q_senders);
+
+ ipc_unlock_object(&msq->q_perm);
+ rcu_read_unlock();
+
+ return msq->q_perm.id;
+}
+
+static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss)
+{
+ mss->tsk = current;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ list_add_tail(&mss->list, &msq->q_senders);
+}
+
+static inline void ss_del(struct msg_sender *mss)
+{
+ if (mss->list.next != NULL)
+ list_del(&mss->list);
+}
+
+static void ss_wakeup(struct list_head *h, int kill)
+{
+ struct msg_sender *mss, *t;
+
+ list_for_each_entry_safe(mss, t, h, list) {
+ if (kill)
+ mss->list.next = NULL;
+ wake_up_process(mss->tsk);
+ }
+}
+
+static void expunge_all(struct msg_queue *msq, int res)
+{
+ struct msg_receiver *msr, *t;
+
+ list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
+ msr->r_msg = NULL; /* initialize expunge ordering */
+ wake_up_process(msr->r_tsk);
+ /*
+ * Ensure that the wakeup is visible before setting r_msg as
+ * the receiving end depends on it: either spinning on a nil,
+ * or dealing with -EAGAIN cases. See lockless receive part 1
+ * and 2 in do_msgrcv().
+ */
+ smp_mb();
+ msr->r_msg = ERR_PTR(res);
+ }
+}
+
+/*
+ * freeque() wakes up waiters on the sender and receiver waiting queue,
+ * removes the message queue from message queue ID IDR, and cleans up all the
+ * messages associated with this queue.
+ *
+ * msg_ids.rwsem (writer) and the spinlock for this message queue are held
+ * before freeque() is called. msg_ids.rwsem remains locked on exit.
+ */
+static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+{
+ struct msg_msg *msg, *t;
+ struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
+
+ expunge_all(msq, -EIDRM);
+ ss_wakeup(&msq->q_senders, 1);
+ msg_rmid(ns, msq);
+ ipc_unlock_object(&msq->q_perm);
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
+ atomic_dec(&ns->msg_hdrs);
+ free_msg(msg);
+ }
+ atomic_sub(msq->q_cbytes, &ns->msg_bytes);
+ ipc_rcu_putref(msq, msg_rcu_free);
+}
+
+/*
+ * Called with msg_ids.rwsem and ipcp locked.
+ */
+static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg)
+{
+ struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
+
+ return security_msg_queue_associate(msq, msgflg);
+}
+
+SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
+{
+ struct ipc_namespace *ns;
+ static const struct ipc_ops msg_ops = {
+ .getnew = newque,
+ .associate = msg_security,
+ };
+ struct ipc_params msg_params;
+
+ ns = current->nsproxy->ipc_ns;
+
+ msg_params.key = key;
+ msg_params.flg = msgflg;
+
+ return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
+}
+
+static inline unsigned long
+copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
+{
+ switch (version) {
+ case IPC_64:
+ return copy_to_user(buf, in, sizeof(*in));
+ case IPC_OLD:
+ {
+ struct msqid_ds out;
+
+ memset(&out, 0, sizeof(out));
+
+ ipc64_perm_to_ipc_perm(&in->msg_perm, &out.msg_perm);
+
+ out.msg_stime = in->msg_stime;
+ out.msg_rtime = in->msg_rtime;
+ out.msg_ctime = in->msg_ctime;
+
+ if (in->msg_cbytes > USHRT_MAX)
+ out.msg_cbytes = USHRT_MAX;
+ else
+ out.msg_cbytes = in->msg_cbytes;
+ out.msg_lcbytes = in->msg_cbytes;
+
+ if (in->msg_qnum > USHRT_MAX)
+ out.msg_qnum = USHRT_MAX;
+ else
+ out.msg_qnum = in->msg_qnum;
+
+ if (in->msg_qbytes > USHRT_MAX)
+ out.msg_qbytes = USHRT_MAX;
+ else
+ out.msg_qbytes = in->msg_qbytes;
+ out.msg_lqbytes = in->msg_qbytes;
+
+ out.msg_lspid = in->msg_lspid;
+ out.msg_lrpid = in->msg_lrpid;
+
+ return copy_to_user(buf, &out, sizeof(out));
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+static inline unsigned long
+copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
+{
+ switch (version) {
+ case IPC_64:
+ if (copy_from_user(out, buf, sizeof(*out)))
+ return -EFAULT;
+ return 0;
+ case IPC_OLD:
+ {
+ struct msqid_ds tbuf_old;
+
+ if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
+ return -EFAULT;
+
+ out->msg_perm.uid = tbuf_old.msg_perm.uid;
+ out->msg_perm.gid = tbuf_old.msg_perm.gid;
+ out->msg_perm.mode = tbuf_old.msg_perm.mode;
+
+ if (tbuf_old.msg_qbytes == 0)
+ out->msg_qbytes = tbuf_old.msg_lqbytes;
+ else
+ out->msg_qbytes = tbuf_old.msg_qbytes;
+
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * This function handles some msgctl commands which require the rwsem
+ * to be held in write mode.
+ * NOTE: no locks must be held, the rwsem is taken inside this function.
+ */
+static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
+ struct msqid_ds __user *buf, int version)
+{
+ struct kern_ipc_perm *ipcp;
+ struct msqid64_ds uninitialized_var(msqid64);
+ struct msg_queue *msq;
+ int err;
+
+ if (cmd == IPC_SET) {
+ if (copy_msqid_from_user(&msqid64, buf, version))
+ return -EFAULT;
+ }
+
+ down_write(&msg_ids(ns).rwsem);
+ rcu_read_lock();
+
+ ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd,
+ &msqid64.msg_perm, msqid64.msg_qbytes);
+ if (IS_ERR(ipcp)) {
+ err = PTR_ERR(ipcp);
+ goto out_unlock1;
+ }
+
+ msq = container_of(ipcp, struct msg_queue, q_perm);
+
+ err = security_msg_queue_msgctl(msq, cmd);
+ if (err)
+ goto out_unlock1;
+
+ switch (cmd) {
+ case IPC_RMID:
+ ipc_lock_object(&msq->q_perm);
+ /* freeque unlocks the ipc object and rcu */
+ freeque(ns, ipcp);
+ goto out_up;
+ case IPC_SET:
+ if (msqid64.msg_qbytes > ns->msg_ctlmnb &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EPERM;
+ goto out_unlock1;
+ }
+
+ ipc_lock_object(&msq->q_perm);
+ err = ipc_update_perm(&msqid64.msg_perm, ipcp);
+ if (err)
+ goto out_unlock0;
+
+ msq->q_qbytes = msqid64.msg_qbytes;
+
+ msq->q_ctime = get_seconds();
+ /* sleeping receivers might be excluded by
+ * stricter permissions.
+ */
+ expunge_all(msq, -EAGAIN);
+ /* sleeping senders might be able to send
+ * due to a larger queue size.
+ */
+ ss_wakeup(&msq->q_senders, 0);
+ break;
+ default:
+ err = -EINVAL;
+ goto out_unlock1;
+ }
+
+out_unlock0:
+ ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+ rcu_read_unlock();
+out_up:
+ up_write(&msg_ids(ns).rwsem);
+ return err;
+}
+
+static int msgctl_nolock(struct ipc_namespace *ns, int msqid,
+ int cmd, int version, void __user *buf)
+{
+ int err;
+ struct msg_queue *msq;
+
+ switch (cmd) {
+ case IPC_INFO:
+ case MSG_INFO:
+ {
+ struct msginfo msginfo;
+ int max_id;
+
+ if (!buf)
+ return -EFAULT;
+
+ /*
+	 * We must not return kernel stack data:
+	 * due to padding, it is not enough
+	 * to set all member fields.
+ */
+ err = security_msg_queue_msgctl(NULL, cmd);
+ if (err)
+ return err;
+
+ memset(&msginfo, 0, sizeof(msginfo));
+ msginfo.msgmni = ns->msg_ctlmni;
+ msginfo.msgmax = ns->msg_ctlmax;
+ msginfo.msgmnb = ns->msg_ctlmnb;
+ msginfo.msgssz = MSGSSZ;
+ msginfo.msgseg = MSGSEG;
+ down_read(&msg_ids(ns).rwsem);
+ if (cmd == MSG_INFO) {
+ msginfo.msgpool = msg_ids(ns).in_use;
+ msginfo.msgmap = atomic_read(&ns->msg_hdrs);
+ msginfo.msgtql = atomic_read(&ns->msg_bytes);
+ } else {
+ msginfo.msgmap = MSGMAP;
+ msginfo.msgpool = MSGPOOL;
+ msginfo.msgtql = MSGTQL;
+ }
+ max_id = ipc_get_maxid(&msg_ids(ns));
+ up_read(&msg_ids(ns).rwsem);
+ if (copy_to_user(buf, &msginfo, sizeof(struct msginfo)))
+ return -EFAULT;
+ return (max_id < 0) ? 0 : max_id;
+ }
+
+ case MSG_STAT:
+ case IPC_STAT:
+ {
+ struct msqid64_ds tbuf;
+ int success_return;
+
+ if (!buf)
+ return -EFAULT;
+
+ memset(&tbuf, 0, sizeof(tbuf));
+
+ rcu_read_lock();
+ if (cmd == MSG_STAT) {
+ msq = msq_obtain_object(ns, msqid);
+ if (IS_ERR(msq)) {
+ err = PTR_ERR(msq);
+ goto out_unlock;
+ }
+ success_return = msq->q_perm.id;
+ } else {
+ msq = msq_obtain_object_check(ns, msqid);
+ if (IS_ERR(msq)) {
+ err = PTR_ERR(msq);
+ goto out_unlock;
+ }
+ success_return = 0;
+ }
+
+ err = -EACCES;
+ if (ipcperms(ns, &msq->q_perm, S_IRUGO))
+ goto out_unlock;
+
+ err = security_msg_queue_msgctl(msq, cmd);
+ if (err)
+ goto out_unlock;
+
+ kernel_to_ipc64_perm(&msq->q_perm, &tbuf.msg_perm);
+ tbuf.msg_stime = msq->q_stime;
+ tbuf.msg_rtime = msq->q_rtime;
+ tbuf.msg_ctime = msq->q_ctime;
+ tbuf.msg_cbytes = msq->q_cbytes;
+ tbuf.msg_qnum = msq->q_qnum;
+ tbuf.msg_qbytes = msq->q_qbytes;
+ tbuf.msg_lspid = msq->q_lspid;
+ tbuf.msg_lrpid = msq->q_lrpid;
+ rcu_read_unlock();
+
+ if (copy_msqid_to_user(buf, &tbuf, version))
+ return -EFAULT;
+ return success_return;
+ }
+
+ default:
+ return -EINVAL;
+ }
+
+ return err;
+out_unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
+{
+ int version;
+ struct ipc_namespace *ns;
+
+ if (msqid < 0 || cmd < 0)
+ return -EINVAL;
+
+ version = ipc_parse_version(&cmd);
+ ns = current->nsproxy->ipc_ns;
+
+ switch (cmd) {
+ case IPC_INFO:
+ case MSG_INFO:
+ case MSG_STAT: /* msqid is an index rather than a msg queue id */
+ case IPC_STAT:
+ return msgctl_nolock(ns, msqid, cmd, version, buf);
+ case IPC_SET:
+ case IPC_RMID:
+ return msgctl_down(ns, msqid, cmd, buf, version);
+ default:
+ return -EINVAL;
+ }
+}
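+
+/*
+ * A minimal userspace sketch (illustration only; "id" is an assumed,
+ * already created queue identifier) of how the msgctl() syscall above
+ * is typically driven:
+ *
+ *	#include <stdio.h>
+ *	#include <sys/msg.h>
+ *
+ *	struct msqid_ds ds;
+ *
+ *	if (msgctl(id, IPC_STAT, &ds) == 0)
+ *		printf("%lu bytes in %lu messages\n",
+ *		       (unsigned long)ds.msg_cbytes,
+ *		       (unsigned long)ds.msg_qnum);
+ *	msgctl(id, IPC_RMID, NULL);	(removes the queue via the msgctl_down() path)
+ */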
+
+static int testmsg(struct msg_msg *msg, long type, int mode)
+{
+ switch (mode) {
+ case SEARCH_ANY:
+ case SEARCH_NUMBER:
+ return 1;
+ case SEARCH_LESSEQUAL:
+ if (msg->m_type <= type)
+ return 1;
+ break;
+ case SEARCH_EQUAL:
+ if (msg->m_type == type)
+ return 1;
+ break;
+ case SEARCH_NOTEQUAL:
+ if (msg->m_type != type)
+ return 1;
+ break;
+ }
+ return 0;
+}
+
+static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
+{
+ struct msg_receiver *msr, *t;
+
+ list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
+ if (testmsg(msg, msr->r_msgtype, msr->r_mode) &&
+ !security_msg_queue_msgrcv(msq, msg, msr->r_tsk,
+ msr->r_msgtype, msr->r_mode)) {
+
+ list_del(&msr->r_list);
+ if (msr->r_maxsize < msg->m_ts) {
+ /* initialize pipelined send ordering */
+ msr->r_msg = NULL;
+ wake_up_process(msr->r_tsk);
+ smp_mb(); /* see barrier comment below */
+ msr->r_msg = ERR_PTR(-E2BIG);
+ } else {
+ msr->r_msg = NULL;
+ msq->q_lrpid = task_pid_vnr(msr->r_tsk);
+ msq->q_rtime = get_seconds();
+ wake_up_process(msr->r_tsk);
+ /*
+ * Ensure that the wakeup is visible before
+ * setting r_msg, as the receiving end depends
+ * on it. See lockless receive part 1 and 2 in
+ * do_msgrcv().
+ */
+ smp_mb();
+ msr->r_msg = msg;
+
+ return 1;
+ }
+ }
+ }
+
+ return 0;
+}
+
+long do_msgsnd(int msqid, long mtype, void __user *mtext,
+ size_t msgsz, int msgflg)
+{
+ struct msg_queue *msq;
+ struct msg_msg *msg;
+ int err;
+ struct ipc_namespace *ns;
+
+ ns = current->nsproxy->ipc_ns;
+
+ if (msgsz > ns->msg_ctlmax || (long) msgsz < 0 || msqid < 0)
+ return -EINVAL;
+ if (mtype < 1)
+ return -EINVAL;
+
+ msg = load_msg(mtext, msgsz);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ msg->m_type = mtype;
+ msg->m_ts = msgsz;
+
+ rcu_read_lock();
+ msq = msq_obtain_object_check(ns, msqid);
+ if (IS_ERR(msq)) {
+ err = PTR_ERR(msq);
+ goto out_unlock1;
+ }
+
+ ipc_lock_object(&msq->q_perm);
+
+ for (;;) {
+ struct msg_sender s;
+
+ err = -EACCES;
+ if (ipcperms(ns, &msq->q_perm, S_IWUGO))
+ goto out_unlock0;
+
+ /* raced with RMID? */
+ if (!ipc_valid_object(&msq->q_perm)) {
+ err = -EIDRM;
+ goto out_unlock0;
+ }
+
+ err = security_msg_queue_msgsnd(msq, msg, msgflg);
+ if (err)
+ goto out_unlock0;
+
+ if (msgsz + msq->q_cbytes <= msq->q_qbytes &&
+ 1 + msq->q_qnum <= msq->q_qbytes) {
+ break;
+ }
+
+ /* queue full, wait: */
+ if (msgflg & IPC_NOWAIT) {
+ err = -EAGAIN;
+ goto out_unlock0;
+ }
+
+ /* enqueue the sender and prepare to block */
+ ss_add(msq, &s);
+
+ if (!ipc_rcu_getref(msq)) {
+ err = -EIDRM;
+ goto out_unlock0;
+ }
+
+ ipc_unlock_object(&msq->q_perm);
+ rcu_read_unlock();
+ schedule();
+
+ rcu_read_lock();
+ ipc_lock_object(&msq->q_perm);
+
+ ipc_rcu_putref(msq, ipc_rcu_free);
+ /* raced with RMID? */
+ if (!ipc_valid_object(&msq->q_perm)) {
+ err = -EIDRM;
+ goto out_unlock0;
+ }
+
+ ss_del(&s);
+
+ if (signal_pending(current)) {
+ err = -ERESTARTNOHAND;
+ goto out_unlock0;
+ }
+
+ }
+ msq->q_lspid = task_tgid_vnr(current);
+ msq->q_stime = get_seconds();
+
+ if (!pipelined_send(msq, msg)) {
+ /* no one is waiting for this message, enqueue it */
+ list_add_tail(&msg->m_list, &msq->q_messages);
+ msq->q_cbytes += msgsz;
+ msq->q_qnum++;
+ atomic_add(msgsz, &ns->msg_bytes);
+ atomic_inc(&ns->msg_hdrs);
+ }
+
+ err = 0;
+ msg = NULL;
+
+out_unlock0:
+ ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+ rcu_read_unlock();
+ if (msg != NULL)
+ free_msg(msg);
+ return err;
+}
+
+SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
+ int, msgflg)
+{
+ long mtype;
+
+ if (get_user(mtype, &msgp->mtype))
+ return -EFAULT;
+ return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg);
+}
+
+static inline int convert_mode(long *msgtyp, int msgflg)
+{
+ if (msgflg & MSG_COPY)
+ return SEARCH_NUMBER;
+ /*
+	 *  Find a message of the correct type.
+	 *  msgtyp = 0 => get the first message.
+	 *  msgtyp > 0 => get the first message of matching type.
+	 *  msgtyp < 0 => get the message with the lowest type <= abs(msgtyp).
+ */
+ if (*msgtyp == 0)
+ return SEARCH_ANY;
+ if (*msgtyp < 0) {
+ *msgtyp = -*msgtyp;
+ return SEARCH_LESSEQUAL;
+ }
+ if (msgflg & MSG_EXCEPT)
+ return SEARCH_NOTEQUAL;
+ return SEARCH_EQUAL;
+}
+
+static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz)
+{
+ struct msgbuf __user *msgp = dest;
+ size_t msgsz;
+
+ if (put_user(msg->m_type, &msgp->mtype))
+ return -EFAULT;
+
+ msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz;
+ if (store_msg(msgp->mtext, msg, msgsz))
+ return -EFAULT;
+ return msgsz;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * This function creates a new kernel message structure, large enough to store
+ * bufsz message bytes.
+ */
+static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz)
+{
+ struct msg_msg *copy;
+
+ /*
+ * Create dummy message to copy real message to.
+ */
+ copy = load_msg(buf, bufsz);
+ if (!IS_ERR(copy))
+ copy->m_ts = bufsz;
+ return copy;
+}
+
+static inline void free_copy(struct msg_msg *copy)
+{
+ if (copy)
+ free_msg(copy);
+}
+#else
+static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz)
+{
+ return ERR_PTR(-ENOSYS);
+}
+
+static inline void free_copy(struct msg_msg *copy)
+{
+}
+#endif
+
+static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode)
+{
+ struct msg_msg *msg, *found = NULL;
+ long count = 0;
+
+ list_for_each_entry(msg, &msq->q_messages, m_list) {
+ if (testmsg(msg, *msgtyp, mode) &&
+ !security_msg_queue_msgrcv(msq, msg, current,
+ *msgtyp, mode)) {
+ if (mode == SEARCH_LESSEQUAL && msg->m_type != 1) {
+ *msgtyp = msg->m_type - 1;
+ found = msg;
+ } else if (mode == SEARCH_NUMBER) {
+ if (*msgtyp == count)
+ return msg;
+ } else
+ return msg;
+ count++;
+ }
+ }
+
+ return found ?: ERR_PTR(-EAGAIN);
+}
+
+long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg,
+ long (*msg_handler)(void __user *, struct msg_msg *, size_t))
+{
+ int mode;
+ struct msg_queue *msq;
+ struct ipc_namespace *ns;
+ struct msg_msg *msg, *copy = NULL;
+
+ ns = current->nsproxy->ipc_ns;
+
+ if (msqid < 0 || (long) bufsz < 0)
+ return -EINVAL;
+
+ if (msgflg & MSG_COPY) {
+ if ((msgflg & MSG_EXCEPT) || !(msgflg & IPC_NOWAIT))
+ return -EINVAL;
+ copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax));
+ if (IS_ERR(copy))
+ return PTR_ERR(copy);
+ }
+ mode = convert_mode(&msgtyp, msgflg);
+
+ rcu_read_lock();
+ msq = msq_obtain_object_check(ns, msqid);
+ if (IS_ERR(msq)) {
+ rcu_read_unlock();
+ free_copy(copy);
+ return PTR_ERR(msq);
+ }
+
+ for (;;) {
+ struct msg_receiver msr_d;
+
+ msg = ERR_PTR(-EACCES);
+ if (ipcperms(ns, &msq->q_perm, S_IRUGO))
+ goto out_unlock1;
+
+ ipc_lock_object(&msq->q_perm);
+
+ /* raced with RMID? */
+ if (!ipc_valid_object(&msq->q_perm)) {
+ msg = ERR_PTR(-EIDRM);
+ goto out_unlock0;
+ }
+
+ msg = find_msg(msq, &msgtyp, mode);
+ if (!IS_ERR(msg)) {
+ /*
+ * Found a suitable message.
+ * Unlink it from the queue.
+ */
+ if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) {
+ msg = ERR_PTR(-E2BIG);
+ goto out_unlock0;
+ }
+ /*
+ * If we are copying, then do not unlink message and do
+ * not update queue parameters.
+ */
+ if (msgflg & MSG_COPY) {
+ msg = copy_msg(msg, copy);
+ goto out_unlock0;
+ }
+
+ list_del(&msg->m_list);
+ msq->q_qnum--;
+ msq->q_rtime = get_seconds();
+ msq->q_lrpid = task_tgid_vnr(current);
+ msq->q_cbytes -= msg->m_ts;
+ atomic_sub(msg->m_ts, &ns->msg_bytes);
+ atomic_dec(&ns->msg_hdrs);
+ ss_wakeup(&msq->q_senders, 0);
+
+ goto out_unlock0;
+ }
+
+ /* No message waiting. Wait for a message */
+ if (msgflg & IPC_NOWAIT) {
+ msg = ERR_PTR(-ENOMSG);
+ goto out_unlock0;
+ }
+
+ list_add_tail(&msr_d.r_list, &msq->q_receivers);
+ msr_d.r_tsk = current;
+ msr_d.r_msgtype = msgtyp;
+ msr_d.r_mode = mode;
+ if (msgflg & MSG_NOERROR)
+ msr_d.r_maxsize = INT_MAX;
+ else
+ msr_d.r_maxsize = bufsz;
+ msr_d.r_msg = ERR_PTR(-EAGAIN);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ ipc_unlock_object(&msq->q_perm);
+ rcu_read_unlock();
+ schedule();
+
+ /* Lockless receive, part 1:
+ * Disable preemption. We don't hold a reference to the queue
+ * and getting a reference would defeat the idea of a lockless
+ * operation, thus the code relies on rcu to guarantee the
+ * existence of msq:
+		 * Prior to destruction, expunge_all(-EIDRM) changes r_msg.
+		 * Thus if r_msg is -EAGAIN, then the queue is not yet destroyed.
+ * rcu_read_lock() prevents preemption between reading r_msg
+ * and acquiring the q_perm.lock in ipc_lock_object().
+ */
+ rcu_read_lock();
+
+ /* Lockless receive, part 2:
+ * Wait until pipelined_send or expunge_all are outside of
+ * wake_up_process(). There is a race with exit(), see
+ * ipc/mqueue.c for the details.
+ */
+ msg = (struct msg_msg *)msr_d.r_msg;
+ while (msg == NULL) {
+ cpu_relax();
+ msg = (struct msg_msg *)msr_d.r_msg;
+ }
+
+ /* Lockless receive, part 3:
+ * If there is a message or an error then accept it without
+ * locking.
+ */
+ if (msg != ERR_PTR(-EAGAIN))
+ goto out_unlock1;
+
+		/* Lockless receive, part 4:
+ * Acquire the queue spinlock.
+ */
+ ipc_lock_object(&msq->q_perm);
+
+		/* Lockless receive, part 5:
+ * Repeat test after acquiring the spinlock.
+ */
+ msg = (struct msg_msg *)msr_d.r_msg;
+ if (msg != ERR_PTR(-EAGAIN))
+ goto out_unlock0;
+
+ list_del(&msr_d.r_list);
+ if (signal_pending(current)) {
+ msg = ERR_PTR(-ERESTARTNOHAND);
+ goto out_unlock0;
+ }
+
+ ipc_unlock_object(&msq->q_perm);
+ }
+
+out_unlock0:
+ ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+ rcu_read_unlock();
+ if (IS_ERR(msg)) {
+ free_copy(copy);
+ return PTR_ERR(msg);
+ }
+
+ bufsz = msg_handler(buf, msg, bufsz);
+ free_msg(msg);
+
+ return bufsz;
+}
+
+SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
+ long, msgtyp, int, msgflg)
+{
+ return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill);
+}
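+
+/*
+ * A minimal userspace sketch (illustration only, assuming a freshly created
+ * private queue) of the msgsnd()/msgrcv() pair implemented above:
+ *
+ *	#include <sys/msg.h>
+ *
+ *	struct { long mtype; char mtext[64]; } m = { 1, "hello" };
+ *	int id = msgget(IPC_PRIVATE, IPC_CREAT | 0600);
+ *
+ *	msgsnd(id, &m, sizeof(m.mtext), 0);	(may sleep if the queue is full)
+ *	msgrcv(id, &m, sizeof(m.mtext), 1, 0);	(first message of type 1)
+ */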
+
+
+void msg_init_ns(struct ipc_namespace *ns)
+{
+ ns->msg_ctlmax = MSGMAX;
+ ns->msg_ctlmnb = MSGMNB;
+ ns->msg_ctlmni = MSGMNI;
+
+ atomic_set(&ns->msg_bytes, 0);
+ atomic_set(&ns->msg_hdrs, 0);
+ ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+}
+
+#ifdef CONFIG_IPC_NS
+void msg_exit_ns(struct ipc_namespace *ns)
+{
+ free_ipcs(ns, &msg_ids(ns), freeque);
+ idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
+{
+ struct user_namespace *user_ns = seq_user_ns(s);
+ struct msg_queue *msq = it;
+
+ seq_printf(s,
+ "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
+ msq->q_perm.key,
+ msq->q_perm.id,
+ msq->q_perm.mode,
+ msq->q_cbytes,
+ msq->q_qnum,
+ msq->q_lspid,
+ msq->q_lrpid,
+ from_kuid_munged(user_ns, msq->q_perm.uid),
+ from_kgid_munged(user_ns, msq->q_perm.gid),
+ from_kuid_munged(user_ns, msq->q_perm.cuid),
+ from_kgid_munged(user_ns, msq->q_perm.cgid),
+ msq->q_stime,
+ msq->q_rtime,
+ msq->q_ctime);
+
+ return 0;
+}
+#endif
+
+void __init msg_init(void)
+{
+ msg_init_ns(&init_ipc_ns);
+
+ ipc_init_proc_interface("sysvipc/msg",
+ " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
+ IPC_MSG_IDS, sysvipc_msg_proc_show);
+}
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
new file mode 100644
index 000000000..2b491590e
--- /dev/null
+++ b/ipc/msgutil.c
@@ -0,0 +1,185 @@
+/*
+ * linux/ipc/msgutil.c
+ * Copyright (C) 1999, 2004 Manfred Spraul
+ *
+ * This file is released under GNU General Public Licence version 2 or
+ * (at your option) any later version.
+ *
+ * See the file COPYING for more details.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/ipc.h>
+#include <linux/msg.h>
+#include <linux/ipc_namespace.h>
+#include <linux/utsname.h>
+#include <linux/proc_ns.h>
+#include <linux/uaccess.h>
+
+#include "util.h"
+
+DEFINE_SPINLOCK(mq_lock);
+
+/*
+ * The next two definitions are here because this is the only file
+ * compiled when either CONFIG_SYSVIPC or CONFIG_POSIX_MQUEUE is set
+ * while CONFIG_IPC_NS is not.
+ */
+struct ipc_namespace init_ipc_ns = {
+ .count = ATOMIC_INIT(1),
+ .user_ns = &init_user_ns,
+ .ns.inum = PROC_IPC_INIT_INO,
+#ifdef CONFIG_IPC_NS
+ .ns.ops = &ipcns_operations,
+#endif
+};
+
+atomic_t nr_ipc_ns = ATOMIC_INIT(1);
+
+struct msg_msgseg {
+ struct msg_msgseg *next;
+ /* the next part of the message follows immediately */
+};
+
+#define DATALEN_MSG ((size_t)PAGE_SIZE-sizeof(struct msg_msg))
+#define DATALEN_SEG ((size_t)PAGE_SIZE-sizeof(struct msg_msgseg))
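+
+/*
+ * A message of len bytes is stored as one struct msg_msg carrying the first
+ * min(len, DATALEN_MSG) payload bytes, followed (via msg->next) by a singly
+ * linked chain of struct msg_msgseg chunks of up to DATALEN_SEG bytes each,
+ * so that no single allocation exceeds one page.
+ */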
+
+
+static struct msg_msg *alloc_msg(size_t len)
+{
+ struct msg_msg *msg;
+ struct msg_msgseg **pseg;
+ size_t alen;
+
+ alen = min(len, DATALEN_MSG);
+ msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
+ if (msg == NULL)
+ return NULL;
+
+ msg->next = NULL;
+ msg->security = NULL;
+
+ len -= alen;
+ pseg = &msg->next;
+ while (len > 0) {
+ struct msg_msgseg *seg;
+ alen = min(len, DATALEN_SEG);
+ seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL);
+ if (seg == NULL)
+ goto out_err;
+ *pseg = seg;
+ seg->next = NULL;
+ pseg = &seg->next;
+ len -= alen;
+ }
+
+ return msg;
+
+out_err:
+ free_msg(msg);
+ return NULL;
+}
+
+struct msg_msg *load_msg(const void __user *src, size_t len)
+{
+ struct msg_msg *msg;
+ struct msg_msgseg *seg;
+ int err = -EFAULT;
+ size_t alen;
+
+ msg = alloc_msg(len);
+ if (msg == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ alen = min(len, DATALEN_MSG);
+ if (copy_from_user(msg + 1, src, alen))
+ goto out_err;
+
+ for (seg = msg->next; seg != NULL; seg = seg->next) {
+ len -= alen;
+ src = (char __user *)src + alen;
+ alen = min(len, DATALEN_SEG);
+ if (copy_from_user(seg + 1, src, alen))
+ goto out_err;
+ }
+
+ err = security_msg_msg_alloc(msg);
+ if (err)
+ goto out_err;
+
+ return msg;
+
+out_err:
+ free_msg(msg);
+ return ERR_PTR(err);
+}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
+{
+ struct msg_msgseg *dst_pseg, *src_pseg;
+ size_t len = src->m_ts;
+ size_t alen;
+
+ BUG_ON(dst == NULL);
+ if (src->m_ts > dst->m_ts)
+ return ERR_PTR(-EINVAL);
+
+ alen = min(len, DATALEN_MSG);
+ memcpy(dst + 1, src + 1, alen);
+
+ for (dst_pseg = dst->next, src_pseg = src->next;
+ src_pseg != NULL;
+ dst_pseg = dst_pseg->next, src_pseg = src_pseg->next) {
+
+ len -= alen;
+ alen = min(len, DATALEN_SEG);
+ memcpy(dst_pseg + 1, src_pseg + 1, alen);
+ }
+
+ dst->m_type = src->m_type;
+ dst->m_ts = src->m_ts;
+
+ return dst;
+}
+#else
+struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
+{
+ return ERR_PTR(-ENOSYS);
+}
+#endif
+int store_msg(void __user *dest, struct msg_msg *msg, size_t len)
+{
+ size_t alen;
+ struct msg_msgseg *seg;
+
+ alen = min(len, DATALEN_MSG);
+ if (copy_to_user(dest, msg + 1, alen))
+ return -1;
+
+ for (seg = msg->next; seg != NULL; seg = seg->next) {
+ len -= alen;
+ dest = (char __user *)dest + alen;
+ alen = min(len, DATALEN_SEG);
+ if (copy_to_user(dest, seg + 1, alen))
+ return -1;
+ }
+ return 0;
+}
+
+void free_msg(struct msg_msg *msg)
+{
+ struct msg_msgseg *seg;
+
+ security_msg_msg_free(msg);
+
+ seg = msg->next;
+ kfree(msg);
+ while (seg != NULL) {
+ struct msg_msgseg *tmp = seg->next;
+ kfree(seg);
+ seg = tmp;
+ }
+}
diff --git a/ipc/namespace.c b/ipc/namespace.c
new file mode 100644
index 000000000..068caf18d
--- /dev/null
+++ b/ipc/namespace.c
@@ -0,0 +1,175 @@
+/*
+ * linux/ipc/namespace.c
+ * Copyright (C) 2006 Pavel Emelyanov <xemul@openvz.org> OpenVZ, SWsoft Inc.
+ */
+
+#include <linux/ipc.h>
+#include <linux/msg.h>
+#include <linux/ipc_namespace.h>
+#include <linux/rcupdate.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
+
+#include "util.h"
+
+static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
+ struct ipc_namespace *old_ns)
+{
+ struct ipc_namespace *ns;
+ int err;
+
+ ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
+ if (ns == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ err = ns_alloc_inum(&ns->ns);
+ if (err) {
+ kfree(ns);
+ return ERR_PTR(err);
+ }
+ ns->ns.ops = &ipcns_operations;
+
+ atomic_set(&ns->count, 1);
+ err = mq_init_ns(ns);
+ if (err) {
+ ns_free_inum(&ns->ns);
+ kfree(ns);
+ return ERR_PTR(err);
+ }
+ atomic_inc(&nr_ipc_ns);
+
+ sem_init_ns(ns);
+ msg_init_ns(ns);
+ shm_init_ns(ns);
+
+ ns->user_ns = get_user_ns(user_ns);
+
+ return ns;
+}
+
+struct ipc_namespace *copy_ipcs(unsigned long flags,
+ struct user_namespace *user_ns, struct ipc_namespace *ns)
+{
+ if (!(flags & CLONE_NEWIPC))
+ return get_ipc_ns(ns);
+ return create_ipc_ns(user_ns, ns);
+}
+
+/*
+ * free_ipcs - free all ipcs of one type
+ * @ns: the namespace to remove the ipcs from
+ * @ids: the table of ipcs to free
+ * @free: the function called to free each individual ipc
+ *
+ * Called for each kind of ipc when an ipc_namespace exits.
+ */
+void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
+ void (*free)(struct ipc_namespace *, struct kern_ipc_perm *))
+{
+ struct kern_ipc_perm *perm;
+ int next_id;
+ int total, in_use;
+
+ down_write(&ids->rwsem);
+
+ in_use = ids->in_use;
+
+ for (total = 0, next_id = 0; total < in_use; next_id++) {
+ perm = idr_find(&ids->ipcs_idr, next_id);
+ if (perm == NULL)
+ continue;
+ rcu_read_lock();
+ ipc_lock_object(perm);
+ free(ns, perm);
+ total++;
+ }
+ up_write(&ids->rwsem);
+}
+
+static void free_ipc_ns(struct ipc_namespace *ns)
+{
+ sem_exit_ns(ns);
+ msg_exit_ns(ns);
+ shm_exit_ns(ns);
+ atomic_dec(&nr_ipc_ns);
+
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ kfree(ns);
+}
+
+/*
+ * put_ipc_ns - drop a reference to an ipc namespace.
+ * @ns: the namespace to put
+ *
+ * If this is the last task in the namespace exiting, and
+ * it is dropping the refcount to 0, then it can race with
+ * a task in another ipc namespace but in a mount namespace
+ * which has this ipcns's mqueuefs mounted, doing some action
+ * with one of the mqueuefs files. That can raise the refcount.
+ * So dropping the refcount, and raising the refcount when
+ * accessing it through the VFS, are protected with mq_lock.
+ *
+ * (Clearly, a task raising the refcount on its own ipc_ns
+ * needn't take mq_lock since it can't race with the last task
+ * in the ipcns exiting).
+ */
+void put_ipc_ns(struct ipc_namespace *ns)
+{
+ if (atomic_dec_and_lock(&ns->count, &mq_lock)) {
+ mq_clear_sbinfo(ns);
+ spin_unlock(&mq_lock);
+ mq_put_mnt(ns);
+ free_ipc_ns(ns);
+ }
+}
+
+static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct ipc_namespace, ns);
+}
+
+static struct ns_common *ipcns_get(struct task_struct *task)
+{
+ struct ipc_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy)
+ ns = get_ipc_ns(nsproxy->ipc_ns);
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void ipcns_put(struct ns_common *ns)
+{
+ return put_ipc_ns(to_ipc_ns(ns));
+}
+
+static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
+{
+ struct ipc_namespace *ns = to_ipc_ns(new);
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Ditch state from the old ipc namespace */
+ exit_sem(current);
+ put_ipc_ns(nsproxy->ipc_ns);
+ nsproxy->ipc_ns = get_ipc_ns(ns);
+ return 0;
+}
+
+const struct proc_ns_operations ipcns_operations = {
+ .name = "ipc",
+ .type = CLONE_NEWIPC,
+ .get = ipcns_get,
+ .put = ipcns_put,
+ .install = ipcns_install,
+};
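+
+/*
+ * A minimal userspace sketch (illustration only; "nsfd" is an assumed open
+ * file descriptor on /proc/<pid>/ns/ipc) of how the operations in this file
+ * are reached:
+ *
+ *	#include <sched.h>
+ *
+ *	unshare(CLONE_NEWIPC);		(new IPC objects now live in a fresh
+ *					 namespace, set up by create_ipc_ns())
+ *	setns(nsfd, CLONE_NEWIPC);	(join an existing namespace through
+ *					 ipcns_install())
+ */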
diff --git a/ipc/sem.c b/ipc/sem.c
new file mode 100644
index 000000000..d1a6edd17
--- /dev/null
+++ b/ipc/sem.c
@@ -0,0 +1,2188 @@
+/*
+ * linux/ipc/sem.c
+ * Copyright (C) 1992 Krishna Balasubramanian
+ * Copyright (C) 1995 Eric Schenk, Bruno Haible
+ *
+ * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
+ *
+ * SMP-threaded, sysctl's added
+ * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
+ * Enforced range limit on SEM_UNDO
+ * (c) 2001 Red Hat Inc
+ * Lockless wakeup
+ * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
+ * Further wakeup optimizations, documentation
+ * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
+ *
+ * support for audit of ipc object properties and permission changes
+ * Dustin Kirkland <dustin.kirkland@us.ibm.com>
+ *
+ * namespaces support
+ * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ *
+ * Implementation notes: (May 2010)
+ * This file implements System V semaphores.
+ *
+ * User space visible behavior:
+ * - FIFO ordering for semop() operations (just FIFO, not starvation
+ * protection)
+ * - multiple semaphore operations that alter the same semaphore in
+ * one semop() are handled.
+ * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
+ * SETALL calls.
+ * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
+ * - undo adjustments at process exit are limited to 0..SEMVMX.
+ * - namespaces are supported.
+ * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
+ * to /proc/sys/kernel/sem.
+ * - statistics about the usage are reported in /proc/sysvipc/sem.
+ *
+ * Internals:
+ * - scalability:
+ * - all global variables are read-mostly.
+ * - semop() calls and semctl(RMID) are synchronized by RCU.
+ * - most operations do write operations (actually: spin_lock calls) to
+ * the per-semaphore array structure.
+ * Thus: Perfect SMP scaling between independent semaphore arrays.
+ * If multiple semaphores in one array are used, then cache line
+ *     thrashing on the semaphore array spinlock will limit the scaling.
+ * - semncnt and semzcnt are calculated on demand in count_semcnt()
+ * - the task that performs a successful semop() scans the list of all
+ * sleeping tasks and completes any pending operations that can be fulfilled.
+ * Semaphores are actively given to waiting tasks (necessary for FIFO).
+ * (see update_queue())
+ * - To improve the scalability, the actual wake-up calls are performed after
+ * dropping all locks. (see wake_up_sem_queue_prepare(),
+ * wake_up_sem_queue_do())
+ * - All work is done by the waker, the woken up task does not have to do
+ * anything - not even acquiring a lock or dropping a refcount.
+ * - A woken up task may not even touch the semaphore array anymore, it may
+ * have been destroyed already by a semctl(RMID).
+ * - The synchronizations between wake-ups due to a timeout/signal and a
+ * wake-up due to a completed semaphore operation is achieved by using an
+ * intermediate state (IN_WAKEUP).
+ * - UNDO values are stored in an array (one per process and per
+ * semaphore array, lazily allocated). For backwards compatibility, multiple
+ * modes for the UNDO variables are supported (per process, per thread)
+ * (see copy_semundo, CLONE_SYSVSEM)
+ * - There are two lists of the pending operations: a per-array list
+ *   and per-semaphore list (stored in the array). This achieves FIFO
+ * ordering without always scanning all pending operations.
+ * The worst-case behavior is nevertheless O(N^2) for N wakeups.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/time.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/seq_file.h>
+#include <linux/rwsem.h>
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+
+#include <linux/uaccess.h>
+#include "util.h"
+
+/* One semaphore structure for each semaphore in the system. */
+struct sem {
+ int semval; /* current value */
+ int sempid; /* pid of last operation */
+ spinlock_t lock; /* spinlock for fine-grained semtimedop */
+ struct list_head pending_alter; /* pending single-sop operations */
+ /* that alter the semaphore */
+ struct list_head pending_const; /* pending single-sop operations */
+ /* that do not alter the semaphore*/
+ time_t sem_otime; /* candidate for sem_otime */
+} ____cacheline_aligned_in_smp;
+
+/* One queue for each sleeping process in the system. */
+struct sem_queue {
+ struct list_head list; /* queue of pending operations */
+ struct task_struct *sleeper; /* this process */
+ struct sem_undo *undo; /* undo structure */
+ int pid; /* process id of requesting process */
+ int status; /* completion status of operation */
+ struct sembuf *sops; /* array of pending operations */
+ struct sembuf *blocking; /* the operation that blocked */
+ int nsops; /* number of operations */
+ int alter; /* does *sops alter the array? */
+};
+
+/* Each task has a list of undo requests. They are executed automatically
+ * when the process exits.
+ */
+struct sem_undo {
+ struct list_head list_proc; /* per-process list: *
+ * all undos from one process
+ * rcu protected */
+ struct rcu_head rcu; /* rcu struct for sem_undo */
+ struct sem_undo_list *ulp; /* back ptr to sem_undo_list */
+ struct list_head list_id; /* per semaphore array list:
+ * all undos for one array */
+ int semid; /* semaphore set identifier */
+ short *semadj; /* array of adjustments */
+ /* one per semaphore */
+};
+
+/* sem_undo_list controls shared access to the list of sem_undo structures
+ * that may be shared among all tasks of a CLONE_SYSVSEM task group.
+ */
+struct sem_undo_list {
+ atomic_t refcnt;
+ spinlock_t lock;
+ struct list_head list_proc;
+};
+
+
+#define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS])
+
+#define sem_checkid(sma, semid) ipc_checkid(&sma->sem_perm, semid)
+
+static int newary(struct ipc_namespace *, struct ipc_params *);
+static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
+#ifdef CONFIG_PROC_FS
+static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
+#endif
+
+#define SEMMSL_FAST 256 /* 512 bytes on stack */
+#define SEMOPM_FAST 64 /* ~ 372 bytes on stack */
+
+/*
+ * Locking:
+ * sem_undo.id_next,
+ * sem_array.complex_count,
+ *	sem_array.pending{_alter,_const},
+ * sem_array.sem_undo: global sem_lock() for read/write
+ * sem_undo.proc_next: only "current" is allowed to read/write that field.
+ *
+ * sem_array.sem_base[i].pending_{const,alter}:
+ * global or semaphore sem_lock() for read/write
+ */
+
+#define sc_semmsl sem_ctls[0]
+#define sc_semmns sem_ctls[1]
+#define sc_semopm sem_ctls[2]
+#define sc_semmni sem_ctls[3]
+
+void sem_init_ns(struct ipc_namespace *ns)
+{
+ ns->sc_semmsl = SEMMSL;
+ ns->sc_semmns = SEMMNS;
+ ns->sc_semopm = SEMOPM;
+ ns->sc_semmni = SEMMNI;
+ ns->used_sems = 0;
+ ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
+}
+
+#ifdef CONFIG_IPC_NS
+void sem_exit_ns(struct ipc_namespace *ns)
+{
+ free_ipcs(ns, &sem_ids(ns), freeary);
+ idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
+}
+#endif
+
+void __init sem_init(void)
+{
+ sem_init_ns(&init_ipc_ns);
+ ipc_init_proc_interface("sysvipc/sem",
+ " key semid perms nsems uid gid cuid cgid otime ctime\n",
+ IPC_SEM_IDS, sysvipc_sem_proc_show);
+}
+
+/**
+ * unmerge_queues - unmerge queues, if possible.
+ * @sma: semaphore array
+ *
+ * The function unmerges the wait queues if complex_count is 0.
+ * It must be called prior to dropping the global semaphore array lock.
+ */
+static void unmerge_queues(struct sem_array *sma)
+{
+ struct sem_queue *q, *tq;
+
+ /* complex operations still around? */
+ if (sma->complex_count)
+ return;
+ /*
+ * We will switch back to simple mode.
+	 * Move all pending operations back into the per-semaphore
+ * queues.
+ */
+ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
+ struct sem *curr;
+ curr = &sma->sem_base[q->sops[0].sem_num];
+
+ list_add_tail(&q->list, &curr->pending_alter);
+ }
+ INIT_LIST_HEAD(&sma->pending_alter);
+}
+
+/**
+ * merge_queues - merge single semop queues into global queue
+ * @sma: semaphore array
+ *
+ * This function merges all per-semaphore queues into the global queue.
+ * It is necessary to achieve FIFO ordering for the pending single-sop
+ * operations when a multi-semop operation must sleep.
+ * Only the alter operations must be moved, the const operations can stay.
+ */
+static void merge_queues(struct sem_array *sma)
+{
+ int i;
+ for (i = 0; i < sma->sem_nsems; i++) {
+ struct sem *sem = sma->sem_base + i;
+
+ list_splice_init(&sem->pending_alter, &sma->pending_alter);
+ }
+}
+
+static void sem_rcu_free(struct rcu_head *head)
+{
+ struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
+ struct sem_array *sma = ipc_rcu_to_struct(p);
+
+ security_sem_free(sma);
+ ipc_rcu_free(head);
+}
+
+/*
+ * Wait until all currently ongoing simple ops have completed.
+ * Caller must own sem_perm.lock.
+ * New simple ops cannot start, because simple ops first check
+ * that a) sem_perm.lock is free and b) complex_count is 0.
+ */
+static void sem_wait_array(struct sem_array *sma)
+{
+ int i;
+ struct sem *sem;
+
+ if (sma->complex_count) {
+ /* The thread that increased sma->complex_count waited on
+ * all sem->lock locks. Thus we don't need to wait again.
+ */
+ return;
+ }
+
+ for (i = 0; i < sma->sem_nsems; i++) {
+ sem = sma->sem_base + i;
+ spin_unlock_wait(&sem->lock);
+ }
+}
+
+/*
+ * If the request contains only one semaphore operation, and there are
+ * no complex transactions pending, lock only the semaphore involved.
+ * Otherwise, lock the entire semaphore array, since we either have
+ * multiple semaphores in our own semops, or we need to look at
+ * semaphores from other pending complex operations.
+ */
+static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
+ int nsops)
+{
+ struct sem *sem;
+
+ if (nsops != 1) {
+ /* Complex operation - acquire a full lock */
+ ipc_lock_object(&sma->sem_perm);
+
+ /* And wait until all simple ops that are processed
+ * right now have dropped their locks.
+ */
+ sem_wait_array(sma);
+ return -1;
+ }
+
+ /*
+ * Only one semaphore affected - try to optimize locking.
+ * The rules are:
+ * - optimized locking is possible if no complex operation
+ * is either enqueued or processed right now.
+ * - The test for enqueued complex ops is simple:
+ * sma->complex_count != 0
+ * - Testing for complex ops that are processed right now is
+ * a bit more difficult. Complex ops acquire the full lock
+ *	 and first wait until the running simple ops have completed.
+ * (see above)
+ * Thus: If we own a simple lock and the global lock is free
+ * and complex_count is now 0, then it will stay 0 and
+ * thus just locking sem->lock is sufficient.
+ */
+ sem = sma->sem_base + sops->sem_num;
+
+ if (sma->complex_count == 0) {
+ /*
+ * It appears that no complex operation is around.
+ * Acquire the per-semaphore lock.
+ */
+ spin_lock(&sem->lock);
+
+ /* Then check that the global lock is free */
+ if (!spin_is_locked(&sma->sem_perm.lock)) {
+ /*
+ * The ipc object lock check must be visible on all
+ * cores before rechecking the complex count. Otherwise
+ * we can race with another thread that does:
+ * complex_count++;
+ * spin_unlock(sem_perm.lock);
+ */
+ smp_rmb();
+
+ /*
+ * Now repeat the test of complex_count:
+ * It can't change anymore until we drop sem->lock.
+			 * Thus: if it is now 0, then it will stay 0.
+ */
+ if (sma->complex_count == 0) {
+ /* fast path successful! */
+ return sops->sem_num;
+ }
+ }
+ spin_unlock(&sem->lock);
+ }
+
+ /* slow path: acquire the full lock */
+ ipc_lock_object(&sma->sem_perm);
+
+ if (sma->complex_count == 0) {
+ /* False alarm:
+ * There is no complex operation, thus we can switch
+ * back to the fast path.
+ */
+ spin_lock(&sem->lock);
+ ipc_unlock_object(&sma->sem_perm);
+ return sops->sem_num;
+ } else {
+ /* Not a false alarm, thus complete the sequence for a
+ * full lock.
+ */
+ sem_wait_array(sma);
+ return -1;
+ }
+}
+
+static inline void sem_unlock(struct sem_array *sma, int locknum)
+{
+ if (locknum == -1) {
+ unmerge_queues(sma);
+ ipc_unlock_object(&sma->sem_perm);
+ } else {
+ struct sem *sem = sma->sem_base + locknum;
+ spin_unlock(&sem->lock);
+ }
+}
+
+/*
+ * sem_lock_(check_) routines are called in the paths where the rwsem
+ * is not held.
+ *
+ * The caller holds the RCU read lock.
+ */
+static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
+ int id, struct sembuf *sops, int nsops, int *locknum)
+{
+ struct kern_ipc_perm *ipcp;
+ struct sem_array *sma;
+
+ ipcp = ipc_obtain_object(&sem_ids(ns), id);
+ if (IS_ERR(ipcp))
+ return ERR_CAST(ipcp);
+
+ sma = container_of(ipcp, struct sem_array, sem_perm);
+ *locknum = sem_lock(sma, sops, nsops);
+
+ /* ipc_rmid() may have already freed the ID while sem_lock
+ * was spinning: verify that the structure is still valid
+ */
+ if (ipc_valid_object(ipcp))
+ return container_of(ipcp, struct sem_array, sem_perm);
+
+ sem_unlock(sma, *locknum);
+ return ERR_PTR(-EINVAL);
+}
+
+static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
+{
+ struct kern_ipc_perm *ipcp = ipc_obtain_object(&sem_ids(ns), id);
+
+ if (IS_ERR(ipcp))
+ return ERR_CAST(ipcp);
+
+ return container_of(ipcp, struct sem_array, sem_perm);
+}
+
+static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
+ int id)
+{
+ struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id);
+
+ if (IS_ERR(ipcp))
+ return ERR_CAST(ipcp);
+
+ return container_of(ipcp, struct sem_array, sem_perm);
+}
+
+static inline void sem_lock_and_putref(struct sem_array *sma)
+{
+ sem_lock(sma, NULL, -1);
+ ipc_rcu_putref(sma, ipc_rcu_free);
+}
+
+static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
+{
+ ipc_rmid(&sem_ids(ns), &s->sem_perm);
+}
+
+/*
+ * Lockless wakeup algorithm:
+ * Without the check/retry algorithm a lockless wakeup is possible:
+ * - queue.status is initialized to -EINTR before blocking.
+ * - wakeup is performed by
+ * * unlinking the queue entry from the pending list
+ * * setting queue.status to IN_WAKEUP
+ * This is the notification for the blocked thread that a
+ * result value is imminent.
+ * * call wake_up_process
+ * * set queue.status to the final value.
+ * - the previously blocked thread checks queue.status:
+ * * if it's IN_WAKEUP, then it must wait until the value changes
+ * * if it's not -EINTR, then the operation was completed by
+ * update_queue. semtimedop can return queue.status without
+ * performing any operation on the sem array.
+ * * otherwise it must acquire the spinlock and check what's up.
+ *
+ * The two-stage algorithm is necessary to protect against the following
+ * races:
+ * - if queue.status is set after wake_up_process, then the woken up idle
+ * thread could race forward and try (and fail) to acquire sma->lock
+ * before update_queue had a chance to set queue.status
+ * - if queue.status is written before wake_up_process and if the
+ * blocked process is woken up by a signal between writing
+ * queue.status and the wake_up_process, then the woken up
+ * process could return from semtimedop and die by calling
+ * sys_exit before wake_up_process is called. Then wake_up_process
+ * will oops, because the task structure is already invalid.
+ * (yes, this happened on s390 with sysv msg).
+ *
+ */
+#define IN_WAKEUP 1
+
+/**
+ * newary - Create a new semaphore set
+ * @ns: namespace
+ * @params: ptr to the structure that contains key, semflg and nsems
+ *
+ * Called with sem_ids.rwsem held (as a writer)
+ */
+static int newary(struct ipc_namespace *ns, struct ipc_params *params)
+{
+ int id;
+ int retval;
+ struct sem_array *sma;
+ int size;
+ key_t key = params->key;
+ int nsems = params->u.nsems;
+ int semflg = params->flg;
+ int i;
+
+ if (!nsems)
+ return -EINVAL;
+ if (ns->used_sems + nsems > ns->sc_semmns)
+ return -ENOSPC;
+
+ size = sizeof(*sma) + nsems * sizeof(struct sem);
+ sma = ipc_rcu_alloc(size);
+ if (!sma)
+ return -ENOMEM;
+
+ memset(sma, 0, size);
+
+ sma->sem_perm.mode = (semflg & S_IRWXUGO);
+ sma->sem_perm.key = key;
+
+ sma->sem_perm.security = NULL;
+ retval = security_sem_alloc(sma);
+ if (retval) {
+ ipc_rcu_putref(sma, ipc_rcu_free);
+ return retval;
+ }
+
+ sma->sem_base = (struct sem *) &sma[1];
+
+ for (i = 0; i < nsems; i++) {
+ INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
+ INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
+ spin_lock_init(&sma->sem_base[i].lock);
+ }
+
+ sma->complex_count = 0;
+ INIT_LIST_HEAD(&sma->pending_alter);
+ INIT_LIST_HEAD(&sma->pending_const);
+ INIT_LIST_HEAD(&sma->list_id);
+ sma->sem_nsems = nsems;
+ sma->sem_ctime = get_seconds();
+
+ id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
+ if (id < 0) {
+ ipc_rcu_putref(sma, sem_rcu_free);
+ return id;
+ }
+ ns->used_sems += nsems;
+
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+
+ return sma->sem_perm.id;
+}
+
+
+/*
+ * Called with sem_ids.rwsem and ipcp locked.
+ */
+static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg)
+{
+ struct sem_array *sma;
+
+ sma = container_of(ipcp, struct sem_array, sem_perm);
+ return security_sem_associate(sma, semflg);
+}
+
+/*
+ * Called with sem_ids.rwsem and ipcp locked.
+ */
+static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
+ struct ipc_params *params)
+{
+ struct sem_array *sma;
+
+ sma = container_of(ipcp, struct sem_array, sem_perm);
+ if (params->u.nsems > sma->sem_nsems)
+ return -EINVAL;
+
+ return 0;
+}
+
+SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
+{
+ struct ipc_namespace *ns;
+ static const struct ipc_ops sem_ops = {
+ .getnew = newary,
+ .associate = sem_security,
+ .more_checks = sem_more_checks,
+ };
+ struct ipc_params sem_params;
+
+ ns = current->nsproxy->ipc_ns;
+
+ if (nsems < 0 || nsems > ns->sc_semmsl)
+ return -EINVAL;
+
+ sem_params.key = key;
+ sem_params.flg = semflg;
+ sem_params.u.nsems = nsems;
+
+ return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
+}
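+
+/*
+ * A minimal userspace sketch (illustration only; union semun must be defined
+ * by the caller on Linux) of the semget() syscall above together with
+ * semop() and semctl(SETVAL):
+ *
+ *	#include <sys/sem.h>
+ *
+ *	union semun { int val; } arg = { .val = 1 };
+ *	struct sembuf down = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO };
+ *	struct sembuf up   = { .sem_num = 0, .sem_op = +1, .sem_flg = SEM_UNDO };
+ *
+ *	int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
+ *	semctl(id, 0, SETVAL, arg);	(initialize the semaphore to 1)
+ *	semop(id, &down, 1);		(P(): sleeps while the value is 0)
+ *	semop(id, &up, 1);		(V(): wakes a sleeping waiter, if any)
+ */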
+
+/**
+ * perform_atomic_semop - Perform (if possible) a semaphore operation
+ * @sma: semaphore array
+ * @q: struct sem_queue that describes the operation
+ *
+ * Returns 0 if the operation was possible.
+ * Returns 1 if the operation is impossible and the caller must sleep.
+ * Negative values are error codes.
+ */
+static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
+{
+ int result, sem_op, nsops, pid;
+ struct sembuf *sop;
+ struct sem *curr;
+ struct sembuf *sops;
+ struct sem_undo *un;
+
+ sops = q->sops;
+ nsops = q->nsops;
+ un = q->undo;
+
+ for (sop = sops; sop < sops + nsops; sop++) {
+ curr = sma->sem_base + sop->sem_num;
+ sem_op = sop->sem_op;
+ result = curr->semval;
+
+ if (!sem_op && result)
+ goto would_block;
+
+ result += sem_op;
+ if (result < 0)
+ goto would_block;
+ if (result > SEMVMX)
+ goto out_of_range;
+
+ if (sop->sem_flg & SEM_UNDO) {
+ int undo = un->semadj[sop->sem_num] - sem_op;
+ /* Exceeding the undo range is an error. */
+ if (undo < (-SEMAEM - 1) || undo > SEMAEM)
+ goto out_of_range;
+ un->semadj[sop->sem_num] = undo;
+ }
+
+ curr->semval = result;
+ }
+
+ sop--;
+ pid = q->pid;
+ while (sop >= sops) {
+ sma->sem_base[sop->sem_num].sempid = pid;
+ sop--;
+ }
+
+ return 0;
+
+out_of_range:
+ result = -ERANGE;
+ goto undo;
+
+would_block:
+ q->blocking = sop;
+
+ if (sop->sem_flg & IPC_NOWAIT)
+ result = -EAGAIN;
+ else
+ result = 1;
+
+undo:
+ sop--;
+ while (sop >= sops) {
+ sem_op = sop->sem_op;
+ sma->sem_base[sop->sem_num].semval -= sem_op;
+ if (sop->sem_flg & SEM_UNDO)
+ un->semadj[sop->sem_num] += sem_op;
+ sop--;
+ }
+
+ return result;
+}
+
+/**
+ * wake_up_sem_queue_prepare - prepare the wake-up of a queue entry
+ * @pt: list head of the tasks that will be woken up
+ * @q: queue entry that must be signaled
+ * @error: Error value for the signal
+ *
+ * Prepare the wake-up of the queue entry q.
+ */
+static void wake_up_sem_queue_prepare(struct list_head *pt,
+ struct sem_queue *q, int error)
+{
+ if (list_empty(pt)) {
+ /*
+ * Hold preempt off so that we don't get preempted and have the
+ * wakee busy-wait until we're scheduled back on.
+ */
+ preempt_disable();
+ }
+ q->status = IN_WAKEUP;
+ q->pid = error;
+
+ list_add_tail(&q->list, pt);
+}
+
+/**
+ * wake_up_sem_queue_do - do the actual wake-up
+ * @pt: list of tasks to be woken up
+ *
+ * Do the actual wake-up.
+ * The function is called without any locks held, thus the semaphore array
+ * could be destroyed already and the tasks can disappear as soon as the
+ * status is set to the actual return code.
+ */
+static void wake_up_sem_queue_do(struct list_head *pt)
+{
+ struct sem_queue *q, *t;
+ int did_something;
+
+ did_something = !list_empty(pt);
+ list_for_each_entry_safe(q, t, pt, list) {
+ wake_up_process(q->sleeper);
+ /* q can disappear immediately after writing q->status. */
+ smp_wmb();
+ q->status = q->pid;
+ }
+ if (did_something)
+ preempt_enable();
+}
+
+static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
+{
+ list_del(&q->list);
+ if (q->nsops > 1)
+ sma->complex_count--;
+}
+
+/**
+ * check_restart - check whether the queue scan must be restarted
+ * @sma: semaphore array
+ * @q: the operation that just completed
+ *
+ * update_queue is O(N^2) when it restarts scanning the whole queue of
+ * waiting operations. Therefore this function checks if the restart is
+ * really necessary. It is called after a previously waiting operation
+ * modified the array.
+ * Note that wait-for-zero operations are handled without restart.
+ */
+static int check_restart(struct sem_array *sma, struct sem_queue *q)
+{
+ /* pending complex alter operations are too difficult to analyse */
+ if (!list_empty(&sma->pending_alter))
+ return 1;
+
+ /* we were a sleeping complex operation. Too difficult */
+ if (q->nsops > 1)
+ return 1;
+
+ /* It is impossible that someone waits for the new value:
+ * - complex operations always restart.
+ *  - wait-for-zero operations are handled separately.
+ * - q is a previously sleeping simple operation that
+ * altered the array. It must be a decrement, because
+ * simple increments never sleep.
+ * - If there are older (higher priority) decrements
+ * in the queue, then they have observed the original
+ *    semval value and couldn't proceed. The current operation only
+ *    decreased the value further - thus they won't proceed either.
+ */
+ return 0;
+}
+
+/**
+ * wake_const_ops - wake up non-alter tasks
+ * @sma: semaphore array.
+ * @semnum: semaphore that was modified.
+ * @pt: list head for the tasks that must be woken up.
+ *
+ * wake_const_ops must be called after a semaphore in a semaphore array
+ * was set to 0. If complex const operations are pending, wake_const_ops must
+ * be called with semnum = -1, as well as with the number of each modified
+ * semaphore.
+ * The tasks that must be woken up are added to @pt. The return code
+ * is stored in q->pid.
+ * The function returns 1 if at least one operation was completed successfully.
+ */
+static int wake_const_ops(struct sem_array *sma, int semnum,
+ struct list_head *pt)
+{
+ struct sem_queue *q;
+ struct list_head *walk;
+ struct list_head *pending_list;
+ int semop_completed = 0;
+
+ if (semnum == -1)
+ pending_list = &sma->pending_const;
+ else
+ pending_list = &sma->sem_base[semnum].pending_const;
+
+ walk = pending_list->next;
+ while (walk != pending_list) {
+ int error;
+
+ q = container_of(walk, struct sem_queue, list);
+ walk = walk->next;
+
+ error = perform_atomic_semop(sma, q);
+
+ if (error <= 0) {
+ /* operation completed, remove from queue & wakeup */
+
+ unlink_queue(sma, q);
+
+ wake_up_sem_queue_prepare(pt, q, error);
+ if (error == 0)
+ semop_completed = 1;
+ }
+ }
+ return semop_completed;
+}
+
+/**
+ * do_smart_wakeup_zero - wakeup all wait for zero tasks
+ * @sma: semaphore array
+ * @sops: operations that were performed
+ * @nsops: number of operations
+ * @pt: list head of the tasks that must be woken up.
+ *
+ * Checks all required queues for wait-for-zero operations, based
+ * on the actual changes that were performed on the semaphore array.
+ * The function returns 1 if at least one operation was completed successfully.
+ */
+static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
+ int nsops, struct list_head *pt)
+{
+ int i;
+ int semop_completed = 0;
+ int got_zero = 0;
+
+ /* first: the per-semaphore queues, if known */
+ if (sops) {
+ for (i = 0; i < nsops; i++) {
+ int num = sops[i].sem_num;
+
+ if (sma->sem_base[num].semval == 0) {
+ got_zero = 1;
+ semop_completed |= wake_const_ops(sma, num, pt);
+ }
+ }
+ } else {
+ /*
+ * No sops means modified semaphores not known.
+ * Assume all were changed.
+ */
+ for (i = 0; i < sma->sem_nsems; i++) {
+ if (sma->sem_base[i].semval == 0) {
+ got_zero = 1;
+ semop_completed |= wake_const_ops(sma, i, pt);
+ }
+ }
+ }
+ /*
+ * If one of the modified semaphores got 0,
+ * then check the global queue, too.
+ */
+ if (got_zero)
+ semop_completed |= wake_const_ops(sma, -1, pt);
+
+ return semop_completed;
+}
+
+
+/**
+ * update_queue - look for tasks that can be completed.
+ * @sma: semaphore array.
+ * @semnum: semaphore that was modified.
+ * @pt: list head for the tasks that must be woken up.
+ *
+ * update_queue must be called after a semaphore in a semaphore array
+ * was modified. If multiple semaphores were modified, update_queue must
+ * be called with semnum = -1, as well as with the number of each modified
+ * semaphore.
+ * The tasks that must be woken up are added to @pt. The return code
+ * is stored in q->pid.
+ * The function internally checks if const operations can now succeed.
+ *
+ * The function returns 1 if at least one semop was completed successfully.
+ */
+static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
+{
+ struct sem_queue *q;
+ struct list_head *walk;
+ struct list_head *pending_list;
+ int semop_completed = 0;
+
+ if (semnum == -1)
+ pending_list = &sma->pending_alter;
+ else
+ pending_list = &sma->sem_base[semnum].pending_alter;
+
+again:
+ walk = pending_list->next;
+ while (walk != pending_list) {
+ int error, restart;
+
+ q = container_of(walk, struct sem_queue, list);
+ walk = walk->next;
+
+ /* If we are scanning the single sop, per-semaphore list of
+ * one semaphore and that semaphore is 0, then it is not
+ * necessary to scan further: simple increments
+ * that affect only one entry succeed immediately and cannot
+ * be in the per semaphore pending queue, and decrements
+ * cannot be successful if the value is already 0.
+ */
+ if (semnum != -1 && sma->sem_base[semnum].semval == 0)
+ break;
+
+ error = perform_atomic_semop(sma, q);
+
+ /* Does q->sleeper still need to sleep? */
+ if (error > 0)
+ continue;
+
+ unlink_queue(sma, q);
+
+ if (error) {
+ restart = 0;
+ } else {
+ semop_completed = 1;
+ do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
+ restart = check_restart(sma, q);
+ }
+
+ wake_up_sem_queue_prepare(pt, q, error);
+ if (restart)
+ goto again;
+ }
+ return semop_completed;
+}
+
+/**
+ * set_semotime - set sem_otime
+ * @sma: semaphore array
+ * @sops: operations that modified the array, may be NULL
+ *
+ * sem_otime is replicated to avoid cache line thrashing.
+ * This function sets one instance to the current time.
+ */
+static void set_semotime(struct sem_array *sma, struct sembuf *sops)
+{
+ if (sops == NULL) {
+ sma->sem_base[0].sem_otime = get_seconds();
+ } else {
+ sma->sem_base[sops[0].sem_num].sem_otime =
+ get_seconds();
+ }
+}
+
+/**
+ * do_smart_update - optimized update_queue
+ * @sma: semaphore array
+ * @sops: operations that were performed
+ * @nsops: number of operations
+ * @otime: force setting otime
+ * @pt: list head of the tasks that must be woken up.
+ *
+ * do_smart_update() does the required calls to update_queue and wakeup_zero,
+ * based on the actual changes that were performed on the semaphore array.
+ * Note that the function does not do the actual wake-up: the caller is
+ * responsible for calling wake_up_sem_queue_do(@pt).
+ * It is safe to perform this call after dropping all locks.
+ */
+static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
+ int otime, struct list_head *pt)
+{
+ int i;
+
+ otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
+
+ if (!list_empty(&sma->pending_alter)) {
+ /* semaphore array uses the global queue - just process it. */
+ otime |= update_queue(sma, -1, pt);
+ } else {
+ if (!sops) {
+ /*
+ * No sops, thus the modified semaphores are not
+ * known. Check all.
+ */
+ for (i = 0; i < sma->sem_nsems; i++)
+ otime |= update_queue(sma, i, pt);
+ } else {
+ /*
+ * Check the semaphores that were increased:
+			 *  - No complex ops, thus all sleeping ops are
+			 *    decrements.
+			 *  - if we decreased the value, then any sleeping
+			 *    semaphore ops won't be able to run: If the
+ * previous value was too small, then the new
+ * value will be too small, too.
+ */
+ for (i = 0; i < nsops; i++) {
+ if (sops[i].sem_op > 0) {
+ otime |= update_queue(sma,
+ sops[i].sem_num, pt);
+ }
+ }
+ }
+ }
+ if (otime)
+ set_semotime(sma, sops);
+}
+
+/*
+ * check_qop: Test if a queued operation sleeps on the semaphore semnum
+ */
+static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q,
+ bool count_zero)
+{
+ struct sembuf *sop = q->blocking;
+
+ /*
+ * Linux always (since 0.99.10) reported a task as sleeping on all
+ * semaphores. This violates SUS, therefore it was changed to the
+ * standard compliant behavior.
+ * Give the administrators a chance to notice that an application
+ * might misbehave because it relies on the Linux behavior.
+ */
+ pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n"
+ "The task %s (%d) triggered the difference, watch for misbehavior.\n",
+ current->comm, task_pid_nr(current));
+
+ if (sop->sem_num != semnum)
+ return 0;
+
+ if (count_zero && sop->sem_op == 0)
+ return 1;
+ if (!count_zero && sop->sem_op < 0)
+ return 1;
+
+ return 0;
+}
+
+/* The following counts are associated to each semaphore:
+ * semncnt number of tasks waiting on semval being nonzero
+ * semzcnt number of tasks waiting on semval being zero
+ *
+ * By definition, a task waits only on the semaphore of the first semop
+ * that cannot proceed, even if additional operations would block, too.
+ */
+static int count_semcnt(struct sem_array *sma, ushort semnum,
+ bool count_zero)
+{
+ struct list_head *l;
+ struct sem_queue *q;
+ int semcnt;
+
+ semcnt = 0;
+ /* First: check the simple operations. They are easy to evaluate */
+ if (count_zero)
+ l = &sma->sem_base[semnum].pending_const;
+ else
+ l = &sma->sem_base[semnum].pending_alter;
+
+ list_for_each_entry(q, l, list) {
+		/* all tasks on a per-semaphore list sleep on exactly
+ * that semaphore
+ */
+ semcnt++;
+ }
+
+ /* Then: check the complex operations. */
+ list_for_each_entry(q, &sma->pending_alter, list) {
+ semcnt += check_qop(sma, semnum, q, count_zero);
+ }
+ if (count_zero) {
+ list_for_each_entry(q, &sma->pending_const, list) {
+ semcnt += check_qop(sma, semnum, q, count_zero);
+ }
+ }
+ return semcnt;
+}
+
+/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
+ * as a writer and the spinlock for this semaphore set held. sem_ids.rwsem
+ * remains locked on exit.
+ */
+static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+{
+ struct sem_undo *un, *tu;
+ struct sem_queue *q, *tq;
+ struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
+ struct list_head tasks;
+ int i;
+
+ /* Free the existing undo structures for this semaphore set. */
+ ipc_assert_locked_object(&sma->sem_perm);
+ list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
+ list_del(&un->list_id);
+ spin_lock(&un->ulp->lock);
+ un->semid = -1;
+ list_del_rcu(&un->list_proc);
+ spin_unlock(&un->ulp->lock);
+ kfree_rcu(un, rcu);
+ }
+
+ /* Wake up all pending processes and let them fail with EIDRM. */
+ INIT_LIST_HEAD(&tasks);
+ list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
+ unlink_queue(sma, q);
+ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ }
+
+ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
+ unlink_queue(sma, q);
+ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ }
+ for (i = 0; i < sma->sem_nsems; i++) {
+ struct sem *sem = sma->sem_base + i;
+ list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
+ unlink_queue(sma, q);
+ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ }
+ list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
+ unlink_queue(sma, q);
+ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ }
+ }
+
+ /* Remove the semaphore set from the IDR */
+ sem_rmid(ns, sma);
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+
+ wake_up_sem_queue_do(&tasks);
+ ns->used_sems -= sma->sem_nsems;
+ ipc_rcu_putref(sma, sem_rcu_free);
+}
+
+static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
+{
+ switch (version) {
+ case IPC_64:
+ return copy_to_user(buf, in, sizeof(*in));
+ case IPC_OLD:
+ {
+ struct semid_ds out;
+
+ memset(&out, 0, sizeof(out));
+
+ ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);
+
+ out.sem_otime = in->sem_otime;
+ out.sem_ctime = in->sem_ctime;
+ out.sem_nsems = in->sem_nsems;
+
+ return copy_to_user(buf, &out, sizeof(out));
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+static time_t get_semotime(struct sem_array *sma)
+{
+ int i;
+ time_t res;
+
+ res = sma->sem_base[0].sem_otime;
+ for (i = 1; i < sma->sem_nsems; i++) {
+ time_t to = sma->sem_base[i].sem_otime;
+
+ if (to > res)
+ res = to;
+ }
+ return res;
+}
+
+static int semctl_nolock(struct ipc_namespace *ns, int semid,
+ int cmd, int version, void __user *p)
+{
+ int err;
+ struct sem_array *sma;
+
+ switch (cmd) {
+ case IPC_INFO:
+ case SEM_INFO:
+ {
+ struct seminfo seminfo;
+ int max_id;
+
+ err = security_sem_semctl(NULL, cmd);
+ if (err)
+ return err;
+
+ memset(&seminfo, 0, sizeof(seminfo));
+ seminfo.semmni = ns->sc_semmni;
+ seminfo.semmns = ns->sc_semmns;
+ seminfo.semmsl = ns->sc_semmsl;
+ seminfo.semopm = ns->sc_semopm;
+ seminfo.semvmx = SEMVMX;
+ seminfo.semmnu = SEMMNU;
+ seminfo.semmap = SEMMAP;
+ seminfo.semume = SEMUME;
+ down_read(&sem_ids(ns).rwsem);
+ if (cmd == SEM_INFO) {
+ seminfo.semusz = sem_ids(ns).in_use;
+ seminfo.semaem = ns->used_sems;
+ } else {
+ seminfo.semusz = SEMUSZ;
+ seminfo.semaem = SEMAEM;
+ }
+ max_id = ipc_get_maxid(&sem_ids(ns));
+ up_read(&sem_ids(ns).rwsem);
+ if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
+ return -EFAULT;
+ return (max_id < 0) ? 0 : max_id;
+ }
+ case IPC_STAT:
+ case SEM_STAT:
+ {
+ struct semid64_ds tbuf;
+ int id = 0;
+
+ memset(&tbuf, 0, sizeof(tbuf));
+
+ rcu_read_lock();
+ if (cmd == SEM_STAT) {
+ sma = sem_obtain_object(ns, semid);
+ if (IS_ERR(sma)) {
+ err = PTR_ERR(sma);
+ goto out_unlock;
+ }
+ id = sma->sem_perm.id;
+ } else {
+ sma = sem_obtain_object_check(ns, semid);
+ if (IS_ERR(sma)) {
+ err = PTR_ERR(sma);
+ goto out_unlock;
+ }
+ }
+
+ err = -EACCES;
+ if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
+ goto out_unlock;
+
+ err = security_sem_semctl(sma, cmd);
+ if (err)
+ goto out_unlock;
+
+ kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
+ tbuf.sem_otime = get_semotime(sma);
+ tbuf.sem_ctime = sma->sem_ctime;
+ tbuf.sem_nsems = sma->sem_nsems;
+ rcu_read_unlock();
+ if (copy_semid_to_user(p, &tbuf, version))
+ return -EFAULT;
+ return id;
+ }
+ default:
+ return -EINVAL;
+ }
+out_unlock:
+ rcu_read_unlock();
+ return err;
+}
+
+static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
+ unsigned long arg)
+{
+ struct sem_undo *un;
+ struct sem_array *sma;
+ struct sem *curr;
+ int err;
+ struct list_head tasks;
+ int val;
+#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
+ /* big-endian 64bit */
+ val = arg >> 32;
+#else
+ /* 32bit or little-endian 64bit */
+ val = arg;
+#endif
+
+ if (val > SEMVMX || val < 0)
+ return -ERANGE;
+
+ INIT_LIST_HEAD(&tasks);
+
+ rcu_read_lock();
+ sma = sem_obtain_object_check(ns, semid);
+ if (IS_ERR(sma)) {
+ rcu_read_unlock();
+ return PTR_ERR(sma);
+ }
+
+ if (semnum < 0 || semnum >= sma->sem_nsems) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+
+ if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
+ rcu_read_unlock();
+ return -EACCES;
+ }
+
+ err = security_sem_semctl(sma, SETVAL);
+ if (err) {
+ rcu_read_unlock();
+ return -EACCES;
+ }
+
+ sem_lock(sma, NULL, -1);
+
+ if (!ipc_valid_object(&sma->sem_perm)) {
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+ return -EIDRM;
+ }
+
+ curr = &sma->sem_base[semnum];
+
+ ipc_assert_locked_object(&sma->sem_perm);
+ list_for_each_entry(un, &sma->list_id, list_id)
+ un->semadj[semnum] = 0;
+
+ curr->semval = val;
+ curr->sempid = task_tgid_vnr(current);
+ sma->sem_ctime = get_seconds();
+ /* maybe some queued-up processes were waiting for this */
+ do_smart_update(sma, NULL, 0, 0, &tasks);
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+ wake_up_sem_queue_do(&tasks);
+ return 0;
+}
+
+static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
+ int cmd, void __user *p)
+{
+ struct sem_array *sma;
+ struct sem *curr;
+ int err, nsems;
+ ushort fast_sem_io[SEMMSL_FAST];
+ ushort *sem_io = fast_sem_io;
+ struct list_head tasks;
+
+ INIT_LIST_HEAD(&tasks);
+
+ rcu_read_lock();
+ sma = sem_obtain_object_check(ns, semid);
+ if (IS_ERR(sma)) {
+ rcu_read_unlock();
+ return PTR_ERR(sma);
+ }
+
+ nsems = sma->sem_nsems;
+
+ err = -EACCES;
+ if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
+ goto out_rcu_wakeup;
+
+ err = security_sem_semctl(sma, cmd);
+ if (err)
+ goto out_rcu_wakeup;
+
+ err = -EACCES;
+ switch (cmd) {
+ case GETALL:
+ {
+ ushort __user *array = p;
+ int i;
+
+ sem_lock(sma, NULL, -1);
+ if (!ipc_valid_object(&sma->sem_perm)) {
+ err = -EIDRM;
+ goto out_unlock;
+ }
+ if (nsems > SEMMSL_FAST) {
+ if (!ipc_rcu_getref(sma)) {
+ err = -EIDRM;
+ goto out_unlock;
+ }
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+ sem_io = ipc_alloc(sizeof(ushort)*nsems);
+ if (sem_io == NULL) {
+ ipc_rcu_putref(sma, ipc_rcu_free);
+ return -ENOMEM;
+ }
+
+ rcu_read_lock();
+ sem_lock_and_putref(sma);
+ if (!ipc_valid_object(&sma->sem_perm)) {
+ err = -EIDRM;
+ goto out_unlock;
+ }
+ }
+ for (i = 0; i < sma->sem_nsems; i++)
+ sem_io[i] = sma->sem_base[i].semval;
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+ err = 0;
+ if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
+ err = -EFAULT;
+ goto out_free;
+ }
+ case SETALL:
+ {
+ int i;
+ struct sem_undo *un;
+
+ if (!ipc_rcu_getref(sma)) {
+ err = -EIDRM;
+ goto out_rcu_wakeup;
+ }
+ rcu_read_unlock();
+
+ if (nsems > SEMMSL_FAST) {
+ sem_io = ipc_alloc(sizeof(ushort)*nsems);
+ if (sem_io == NULL) {
+ ipc_rcu_putref(sma, ipc_rcu_free);
+ return -ENOMEM;
+ }
+ }
+
+ if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
+ ipc_rcu_putref(sma, ipc_rcu_free);
+ err = -EFAULT;
+ goto out_free;
+ }
+
+ for (i = 0; i < nsems; i++) {
+ if (sem_io[i] > SEMVMX) {
+ ipc_rcu_putref(sma, ipc_rcu_free);
+ err = -ERANGE;
+ goto out_free;
+ }
+ }
+ rcu_read_lock();
+ sem_lock_and_putref(sma);
+ if (!ipc_valid_object(&sma->sem_perm)) {
+ err = -EIDRM;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < nsems; i++)
+ sma->sem_base[i].semval = sem_io[i];
+
+ ipc_assert_locked_object(&sma->sem_perm);
+ list_for_each_entry(un, &sma->list_id, list_id) {
+ for (i = 0; i < nsems; i++)
+ un->semadj[i] = 0;
+ }
+ sma->sem_ctime = get_seconds();
+ /* maybe some queued-up processes were waiting for this */
+ do_smart_update(sma, NULL, 0, 0, &tasks);
+ err = 0;
+ goto out_unlock;
+ }
+ /* GETVAL, GETPID, GETNCNT, GETZCNT: fall-through */
+ }
+ err = -EINVAL;
+ if (semnum < 0 || semnum >= nsems)
+ goto out_rcu_wakeup;
+
+ sem_lock(sma, NULL, -1);
+ if (!ipc_valid_object(&sma->sem_perm)) {
+ err = -EIDRM;
+ goto out_unlock;
+ }
+ curr = &sma->sem_base[semnum];
+
+ switch (cmd) {
+ case GETVAL:
+ err = curr->semval;
+ goto out_unlock;
+ case GETPID:
+ err = curr->sempid;
+ goto out_unlock;
+ case GETNCNT:
+ err = count_semcnt(sma, semnum, 0);
+ goto out_unlock;
+ case GETZCNT:
+ err = count_semcnt(sma, semnum, 1);
+ goto out_unlock;
+ }
+
+out_unlock:
+ sem_unlock(sma, -1);
+out_rcu_wakeup:
+ rcu_read_unlock();
+ wake_up_sem_queue_do(&tasks);
+out_free:
+ if (sem_io != fast_sem_io)
+ ipc_free(sem_io, sizeof(ushort)*nsems);
+ return err;
+}
+
+static inline unsigned long
+copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
+{
+ switch (version) {
+ case IPC_64:
+ if (copy_from_user(out, buf, sizeof(*out)))
+ return -EFAULT;
+ return 0;
+ case IPC_OLD:
+ {
+ struct semid_ds tbuf_old;
+
+ if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
+ return -EFAULT;
+
+ out->sem_perm.uid = tbuf_old.sem_perm.uid;
+ out->sem_perm.gid = tbuf_old.sem_perm.gid;
+ out->sem_perm.mode = tbuf_old.sem_perm.mode;
+
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * This function handles some semctl commands which require the rwsem
+ * to be held in write mode.
+ * NOTE: the caller must hold no locks; the rwsem is taken inside this function.
+ */
+static int semctl_down(struct ipc_namespace *ns, int semid,
+ int cmd, int version, void __user *p)
+{
+ struct sem_array *sma;
+ int err;
+ struct semid64_ds semid64;
+ struct kern_ipc_perm *ipcp;
+
+ if (cmd == IPC_SET) {
+ if (copy_semid_from_user(&semid64, p, version))
+ return -EFAULT;
+ }
+
+ down_write(&sem_ids(ns).rwsem);
+ rcu_read_lock();
+
+ ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd,
+ &semid64.sem_perm, 0);
+ if (IS_ERR(ipcp)) {
+ err = PTR_ERR(ipcp);
+ goto out_unlock1;
+ }
+
+ sma = container_of(ipcp, struct sem_array, sem_perm);
+
+ err = security_sem_semctl(sma, cmd);
+ if (err)
+ goto out_unlock1;
+
+ switch (cmd) {
+ case IPC_RMID:
+ sem_lock(sma, NULL, -1);
+ /* freeary unlocks the ipc object and rcu */
+ freeary(ns, ipcp);
+ goto out_up;
+ case IPC_SET:
+ sem_lock(sma, NULL, -1);
+ err = ipc_update_perm(&semid64.sem_perm, ipcp);
+ if (err)
+ goto out_unlock0;
+ sma->sem_ctime = get_seconds();
+ break;
+ default:
+ err = -EINVAL;
+ goto out_unlock1;
+ }
+
+out_unlock0:
+ sem_unlock(sma, -1);
+out_unlock1:
+ rcu_read_unlock();
+out_up:
+ up_write(&sem_ids(ns).rwsem);
+ return err;
+}
+
+SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
+{
+ int version;
+ struct ipc_namespace *ns;
+ void __user *p = (void __user *)arg;
+
+ if (semid < 0)
+ return -EINVAL;
+
+ version = ipc_parse_version(&cmd);
+ ns = current->nsproxy->ipc_ns;
+
+ switch (cmd) {
+ case IPC_INFO:
+ case SEM_INFO:
+ case IPC_STAT:
+ case SEM_STAT:
+ return semctl_nolock(ns, semid, cmd, version, p);
+ case GETALL:
+ case GETVAL:
+ case GETPID:
+ case GETNCNT:
+ case GETZCNT:
+ case SETALL:
+ return semctl_main(ns, semid, semnum, cmd, p);
+ case SETVAL:
+ return semctl_setval(ns, semid, semnum, arg);
+ case IPC_RMID:
+ case IPC_SET:
+ return semctl_down(ns, semid, cmd, version, p);
+ default:
+ return -EINVAL;
+ }
+}
+
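+/*
+ * Rough userspace sketch of the dispatch above (assumes the standard SysV
+ * semget(2)/semctl(2) API; error handling omitted):
+ *
+ * union semun { int val; struct semid_ds *buf; unsigned short *array; };
+ *
+ * int id = semget(IPC_PRIVATE, 1, 0600);
+ * union semun arg = { .val = 1 };
+ * semctl(id, 0, SETVAL, arg);    - handled by semctl_setval()
+ * int v = semctl(id, 0, GETVAL); - handled by semctl_main()
+ * semctl(id, 0, IPC_RMID);       - handled by semctl_down()
+ */
+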
+/* If the task doesn't already have an undo_list, then allocate one
+ * here. We guarantee there is only one thread using this undo list,
+ * and current is THE ONE
+ *
+ * If this allocation and assignment succeeds, but later
+ * portions of this code fail, there is no need to free the sem_undo_list.
+ * Just let it stay associated with the task, and it'll be freed later
+ * at exit time.
+ *
+ * This can block, so callers must hold no locks.
+ */
+static inline int get_undo_list(struct sem_undo_list **undo_listp)
+{
+ struct sem_undo_list *undo_list;
+
+ undo_list = current->sysvsem.undo_list;
+ if (!undo_list) {
+ undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
+ if (undo_list == NULL)
+ return -ENOMEM;
+ spin_lock_init(&undo_list->lock);
+ atomic_set(&undo_list->refcnt, 1);
+ INIT_LIST_HEAD(&undo_list->list_proc);
+
+ current->sysvsem.undo_list = undo_list;
+ }
+ *undo_listp = undo_list;
+ return 0;
+}
+
+static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
+{
+ struct sem_undo *un;
+
+ list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) {
+ if (un->semid == semid)
+ return un;
+ }
+ return NULL;
+}
+
+static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
+{
+ struct sem_undo *un;
+
+ assert_spin_locked(&ulp->lock);
+
+ un = __lookup_undo(ulp, semid);
+ if (un) {
+ list_del_rcu(&un->list_proc);
+ list_add_rcu(&un->list_proc, &ulp->list_proc);
+ }
+ return un;
+}
+
+/**
+ * find_alloc_undo - lookup (and if not present create) undo array
+ * @ns: namespace
+ * @semid: semaphore array id
+ *
+ * The function looks up (and if not present creates) the undo structure.
+ * The size of the undo structure depends on the size of the semaphore
+ * array, thus the alloc path is not that straightforward.
+ * Lifetime rules: sem_undo is rcu-protected; on success, the function
+ * performs an rcu_read_lock().
+ */
+static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
+{
+ struct sem_array *sma;
+ struct sem_undo_list *ulp;
+ struct sem_undo *un, *new;
+ int nsems, error;
+
+ error = get_undo_list(&ulp);
+ if (error)
+ return ERR_PTR(error);
+
+ rcu_read_lock();
+ spin_lock(&ulp->lock);
+ un = lookup_undo(ulp, semid);
+ spin_unlock(&ulp->lock);
+ if (likely(un != NULL))
+ goto out;
+
+ /* no undo structure around - allocate one. */
+ /* step 1: figure out the size of the semaphore array */
+ sma = sem_obtain_object_check(ns, semid);
+ if (IS_ERR(sma)) {
+ rcu_read_unlock();
+ return ERR_CAST(sma);
+ }
+
+ nsems = sma->sem_nsems;
+ if (!ipc_rcu_getref(sma)) {
+ rcu_read_unlock();
+ un = ERR_PTR(-EIDRM);
+ goto out;
+ }
+ rcu_read_unlock();
+
+ /* step 2: allocate new undo structure */
+ new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
+ if (!new) {
+ ipc_rcu_putref(sma, ipc_rcu_free);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* step 3: Acquire the lock on semaphore array */
+ rcu_read_lock();
+ sem_lock_and_putref(sma);
+ if (!ipc_valid_object(&sma->sem_perm)) {
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+ kfree(new);
+ un = ERR_PTR(-EIDRM);
+ goto out;
+ }
+ spin_lock(&ulp->lock);
+
+ /*
+ * step 4: check for races: did someone else allocate the undo struct?
+ */
+ un = lookup_undo(ulp, semid);
+ if (un) {
+ kfree(new);
+ goto success;
+ }
+ /* step 5: initialize & link new undo structure */
+ new->semadj = (short *) &new[1];
+ new->ulp = ulp;
+ new->semid = semid;
+ assert_spin_locked(&ulp->lock);
+ list_add_rcu(&new->list_proc, &ulp->list_proc);
+ ipc_assert_locked_object(&sma->sem_perm);
+ list_add(&new->list_id, &sma->list_id);
+ un = new;
+
+success:
+ spin_unlock(&ulp->lock);
+ sem_unlock(sma, -1);
+out:
+ return un;
+}
+
+
+/**
+ * get_queue_result - retrieve the result code from sem_queue
+ * @q: Pointer to queue structure
+ *
+ * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
+ * q->status, then we must loop until the value is replaced with the final
+ * value: This may happen if a task is woken up by an unrelated event (e.g.
+ * signal) and in parallel the task is woken up by another task because it got
+ * the requested semaphores.
+ *
+ * The function can be called with or without holding the semaphore spinlock.
+ */
+static int get_queue_result(struct sem_queue *q)
+{
+ int error;
+
+ error = q->status;
+ while (unlikely(error == IN_WAKEUP)) {
+ cpu_relax();
+ error = q->status;
+ }
+
+ return error;
+}
+
+SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
+ unsigned, nsops, const struct timespec __user *, timeout)
+{
+ int error = -EINVAL;
+ struct sem_array *sma;
+ struct sembuf fast_sops[SEMOPM_FAST];
+ struct sembuf *sops = fast_sops, *sop;
+ struct sem_undo *un;
+ int undos = 0, alter = 0, max, locknum;
+ struct sem_queue queue;
+ unsigned long jiffies_left = 0;
+ struct ipc_namespace *ns;
+ struct list_head tasks;
+
+ ns = current->nsproxy->ipc_ns;
+
+ if (nsops < 1 || semid < 0)
+ return -EINVAL;
+ if (nsops > ns->sc_semopm)
+ return -E2BIG;
+ if (nsops > SEMOPM_FAST) {
+ sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL);
+ if (sops == NULL)
+ return -ENOMEM;
+ }
+ if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+ if (timeout) {
+ struct timespec _timeout;
+ if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
+ error = -EFAULT;
+ goto out_free;
+ }
+ if (_timeout.tv_sec < 0 || _timeout.tv_nsec < 0 ||
+ _timeout.tv_nsec >= 1000000000L) {
+ error = -EINVAL;
+ goto out_free;
+ }
+ jiffies_left = timespec_to_jiffies(&_timeout);
+ }
+ max = 0;
+ for (sop = sops; sop < sops + nsops; sop++) {
+ if (sop->sem_num >= max)
+ max = sop->sem_num;
+ if (sop->sem_flg & SEM_UNDO)
+ undos = 1;
+ if (sop->sem_op != 0)
+ alter = 1;
+ }
+
+ INIT_LIST_HEAD(&tasks);
+
+ if (undos) {
+ /* On success, find_alloc_undo takes the rcu_read_lock */
+ un = find_alloc_undo(ns, semid);
+ if (IS_ERR(un)) {
+ error = PTR_ERR(un);
+ goto out_free;
+ }
+ } else {
+ un = NULL;
+ rcu_read_lock();
+ }
+
+ sma = sem_obtain_object_check(ns, semid);
+ if (IS_ERR(sma)) {
+ rcu_read_unlock();
+ error = PTR_ERR(sma);
+ goto out_free;
+ }
+
+ error = -EFBIG;
+ if (max >= sma->sem_nsems)
+ goto out_rcu_wakeup;
+
+ error = -EACCES;
+ if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO))
+ goto out_rcu_wakeup;
+
+ error = security_sem_semop(sma, sops, nsops, alter);
+ if (error)
+ goto out_rcu_wakeup;
+
+ error = -EIDRM;
+ locknum = sem_lock(sma, sops, nsops);
+ /*
+ * We eventually might perform the following check in a lockless
+ * fashion, considering ipc_valid_object() locking constraints.
+ * If nsops == 1 and there is no contention for sem_perm.lock, then
+ * only a per-semaphore lock is held and it's OK to proceed with the
+ * check below. More details on the fine-grained locking scheme
+ * entangled here, and why it is RMID race safe, are in the comments at sem_lock().
+ */
+ if (!ipc_valid_object(&sma->sem_perm))
+ goto out_unlock_free;
+ /*
+ * semid identifiers are not unique - find_alloc_undo may have
+ * allocated an undo structure that was then invalidated by an RMID,
+ * and now a new array has received the same id. Check and fail.
+ * This case can be detected by checking un->semid. The existence of
+ * "un" itself is guaranteed by rcu.
+ */
+ if (un && un->semid == -1)
+ goto out_unlock_free;
+
+ queue.sops = sops;
+ queue.nsops = nsops;
+ queue.undo = un;
+ queue.pid = task_tgid_vnr(current);
+ queue.alter = alter;
+
+ error = perform_atomic_semop(sma, &queue);
+ if (error == 0) {
+ /* If the operation was successful, then do
+ * the required updates.
+ */
+ if (alter)
+ do_smart_update(sma, sops, nsops, 1, &tasks);
+ else
+ set_semotime(sma, sops);
+ }
+ if (error <= 0)
+ goto out_unlock_free;
+
+ /* We need to sleep on this operation, so we put the current
+ * task into the pending queue and go to sleep.
+ */
+
+ if (nsops == 1) {
+ struct sem *curr;
+ curr = &sma->sem_base[sops->sem_num];
+
+ if (alter) {
+ if (sma->complex_count) {
+ list_add_tail(&queue.list,
+ &sma->pending_alter);
+ } else {
+
+ list_add_tail(&queue.list,
+ &curr->pending_alter);
+ }
+ } else {
+ list_add_tail(&queue.list, &curr->pending_const);
+ }
+ } else {
+ if (!sma->complex_count)
+ merge_queues(sma);
+
+ if (alter)
+ list_add_tail(&queue.list, &sma->pending_alter);
+ else
+ list_add_tail(&queue.list, &sma->pending_const);
+
+ sma->complex_count++;
+ }
+
+ queue.status = -EINTR;
+ queue.sleeper = current;
+
+sleep_again:
+ __set_current_state(TASK_INTERRUPTIBLE);
+ sem_unlock(sma, locknum);
+ rcu_read_unlock();
+
+ if (timeout)
+ jiffies_left = schedule_timeout(jiffies_left);
+ else
+ schedule();
+
+ error = get_queue_result(&queue);
+
+ if (error != -EINTR) {
+ /* fast path: update_queue already obtained all requested
+ * resources.
+ * Perform a smp_mb(): User space could assume that semop()
+ * is a memory barrier: Without the mb(), the cpu could
+ * speculatively read in user space stale data that was
+ * overwritten by the previous owner of the semaphore.
+ */
+ smp_mb();
+
+ goto out_free;
+ }
+
+ rcu_read_lock();
+ sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum);
+
+ /*
+ * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
+ */
+ error = get_queue_result(&queue);
+
+ /*
+ * Array removed? If yes, leave without sem_unlock().
+ */
+ if (IS_ERR(sma)) {
+ rcu_read_unlock();
+ goto out_free;
+ }
+
+
+ /*
+ * If queue.status != -EINTR, we were woken up by another process.
+ * Leave without unlink_queue(), but with sem_unlock().
+ */
+ if (error != -EINTR)
+ goto out_unlock_free;
+
+ /*
+ * If an interrupt occurred we have to clean up the queue
+ */
+ if (timeout && jiffies_left == 0)
+ error = -EAGAIN;
+
+ /*
+ * If the wakeup was spurious, just retry
+ */
+ if (error == -EINTR && !signal_pending(current))
+ goto sleep_again;
+
+ unlink_queue(sma, &queue);
+
+out_unlock_free:
+ sem_unlock(sma, locknum);
+out_rcu_wakeup:
+ rcu_read_unlock();
+ wake_up_sem_queue_do(&tasks);
+out_free:
+ if (sops != fast_sops)
+ kfree(sops);
+ return error;
+}
+
+SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
+ unsigned, nsops)
+{
+ return sys_semtimedop(semid, tsops, nsops, NULL);
+}
+
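+/*
+ * Rough userspace sketch of the sleeping path above (assumes the standard
+ * semop(2)/semtimedop(2) API; error handling omitted). SEM_UNDO makes
+ * find_alloc_undo() above attach a per-task adjustment that exit_sem()
+ * applies when the task exits:
+ *
+ * struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO };
+ * struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
+ * semtimedop(id, &op, 1, &ts);   - blocks up to 5s, fails with EAGAIN on timeout
+ */
+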
+/* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
+ * parent and child tasks.
+ */
+
+int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
+{
+ struct sem_undo_list *undo_list;
+ int error;
+
+ if (clone_flags & CLONE_SYSVSEM) {
+ error = get_undo_list(&undo_list);
+ if (error)
+ return error;
+ atomic_inc(&undo_list->refcnt);
+ tsk->sysvsem.undo_list = undo_list;
+ } else
+ tsk->sysvsem.undo_list = NULL;
+
+ return 0;
+}
+
+/*
+ * add semadj values to semaphores, free undo structures.
+ * undo structures are not freed when semaphore arrays are destroyed
+ * so some of them may be out of date.
+ * IMPLEMENTATION NOTE: There is some confusion over whether the
+ * set of adjustments should be applied atomically or not. That is,
+ * if we are attempting to decrement the semval
+ * should we queue up and wait until we can do so legally?
+ * The original implementation attempted to do this (queue and wait).
+ * The current implementation does not do so. The POSIX standard
+ * and SVID should be consulted to determine what behavior is mandated.
+ */
+void exit_sem(struct task_struct *tsk)
+{
+ struct sem_undo_list *ulp;
+
+ ulp = tsk->sysvsem.undo_list;
+ if (!ulp)
+ return;
+ tsk->sysvsem.undo_list = NULL;
+
+ if (!atomic_dec_and_test(&ulp->refcnt))
+ return;
+
+ for (;;) {
+ struct sem_array *sma;
+ struct sem_undo *un;
+ struct list_head tasks;
+ int semid, i;
+
+ rcu_read_lock();
+ un = list_entry_rcu(ulp->list_proc.next,
+ struct sem_undo, list_proc);
+ if (&un->list_proc == &ulp->list_proc)
+ semid = -1;
+ else
+ semid = un->semid;
+
+ if (semid == -1) {
+ rcu_read_unlock();
+ break;
+ }
+
+ sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, un->semid);
+ /* exit_sem raced with IPC_RMID, nothing to do */
+ if (IS_ERR(sma)) {
+ rcu_read_unlock();
+ continue;
+ }
+
+ sem_lock(sma, NULL, -1);
+ /* exit_sem raced with IPC_RMID, nothing to do */
+ if (!ipc_valid_object(&sma->sem_perm)) {
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+ continue;
+ }
+ un = __lookup_undo(ulp, semid);
+ if (un == NULL) {
+ /* exit_sem raced with IPC_RMID+semget() that created
+ * exactly the same semid. Nothing to do.
+ */
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+ continue;
+ }
+
+ /* remove un from the linked lists */
+ ipc_assert_locked_object(&sma->sem_perm);
+ list_del(&un->list_id);
+
+ spin_lock(&ulp->lock);
+ list_del_rcu(&un->list_proc);
+ spin_unlock(&ulp->lock);
+
+ /* perform adjustments registered in un */
+ for (i = 0; i < sma->sem_nsems; i++) {
+ struct sem *semaphore = &sma->sem_base[i];
+ if (un->semadj[i]) {
+ semaphore->semval += un->semadj[i];
+ /*
+ * Range checks of the new semaphore value,
+ * not defined by SUS (the Single UNIX Specification):
+ * - Some unices ignore the undo entirely
+ * (e.g. HP UX 11i 11.22, Tru64 V5.1)
+ * - some cap the value (e.g. FreeBSD caps
+ * at 0, but doesn't enforce SEMVMX)
+ *
+ * Linux caps the semaphore value, both at 0
+ * and at SEMVMX.
+ *
+ * Manfred <manfred@colorfullife.com>
+ */
+ if (semaphore->semval < 0)
+ semaphore->semval = 0;
+ if (semaphore->semval > SEMVMX)
+ semaphore->semval = SEMVMX;
+ semaphore->sempid = task_tgid_vnr(current);
+ }
+ }
+ /* maybe some queued-up processes were waiting for this */
+ INIT_LIST_HEAD(&tasks);
+ do_smart_update(sma, NULL, 0, 1, &tasks);
+ sem_unlock(sma, -1);
+ rcu_read_unlock();
+ wake_up_sem_queue_do(&tasks);
+
+ kfree_rcu(un, rcu);
+ }
+ kfree(ulp);
+}
+
+#ifdef CONFIG_PROC_FS
+static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
+{
+ struct user_namespace *user_ns = seq_user_ns(s);
+ struct sem_array *sma = it;
+ time_t sem_otime;
+
+ /*
+ * The proc interface isn't aware of sem_lock(), it calls
+ * ipc_lock_object() directly (in sysvipc_find_ipc).
+ * In order to stay compatible with sem_lock(), we must wait until
+ * all simple semop() calls have left their critical regions.
+ */
+ sem_wait_array(sma);
+
+ sem_otime = get_semotime(sma);
+
+ seq_printf(s,
+ "%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
+ sma->sem_perm.key,
+ sma->sem_perm.id,
+ sma->sem_perm.mode,
+ sma->sem_nsems,
+ from_kuid_munged(user_ns, sma->sem_perm.uid),
+ from_kgid_munged(user_ns, sma->sem_perm.gid),
+ from_kuid_munged(user_ns, sma->sem_perm.cuid),
+ from_kgid_munged(user_ns, sma->sem_perm.cgid),
+ sem_otime,
+ sma->sem_ctime);
+
+ return 0;
+}
+#endif
diff --git a/ipc/shm.c b/ipc/shm.c
new file mode 100644
index 000000000..d2f284c1f
--- /dev/null
+++ b/ipc/shm.c
@@ -0,0 +1,1368 @@
+/*
+ * linux/ipc/shm.c
+ * Copyright (C) 1992, 1993 Krishna Balasubramanian
+ * Many improvements/fixes by Bruno Haible.
+ * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
+ * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
+ *
+ * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
+ * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
+ * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
+ * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
+ * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com>
+ * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
+ * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
+ *
+ * support for audit of ipc object properties and permission changes
+ * Dustin Kirkland <dustin.kirkland@us.ibm.com>
+ *
+ * namespaces support
+ * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ *
+ * Better ipc lock (kern_ipc_perm.lock) handling
+ * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013.
+ */
+
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/shm.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/mman.h>
+#include <linux/shmem_fs.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/ptrace.h>
+#include <linux/seq_file.h>
+#include <linux/rwsem.h>
+#include <linux/nsproxy.h>
+#include <linux/mount.h>
+#include <linux/ipc_namespace.h>
+
+#include <linux/uaccess.h>
+
+#include "util.h"
+
+struct shm_file_data {
+ int id;
+ struct ipc_namespace *ns;
+ struct file *file;
+ const struct vm_operations_struct *vm_ops;
+};
+
+#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
+
+static const struct file_operations shm_file_operations;
+static const struct vm_operations_struct shm_vm_ops;
+
+#define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS])
+
+#define shm_unlock(shp) \
+ ipc_unlock(&(shp)->shm_perm)
+
+static int newseg(struct ipc_namespace *, struct ipc_params *);
+static void shm_open(struct vm_area_struct *vma);
+static void shm_close(struct vm_area_struct *vma);
+static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
+#ifdef CONFIG_PROC_FS
+static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
+#endif
+
+void shm_init_ns(struct ipc_namespace *ns)
+{
+ ns->shm_ctlmax = SHMMAX;
+ ns->shm_ctlall = SHMALL;
+ ns->shm_ctlmni = SHMMNI;
+ ns->shm_rmid_forced = 0;
+ ns->shm_tot = 0;
+ ipc_init_ids(&shm_ids(ns));
+}
+
+/*
+ * Called with shm_ids.rwsem (writer) and the shp structure locked.
+ * Only shm_ids.rwsem remains locked on exit.
+ */
+static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+{
+ struct shmid_kernel *shp;
+ shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+
+ if (shp->shm_nattch) {
+ shp->shm_perm.mode |= SHM_DEST;
+ /* Do not find it any more */
+ shp->shm_perm.key = IPC_PRIVATE;
+ shm_unlock(shp);
+ } else
+ shm_destroy(ns, shp);
+}
+
+#ifdef CONFIG_IPC_NS
+void shm_exit_ns(struct ipc_namespace *ns)
+{
+ free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
+ idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
+}
+#endif
+
+static int __init ipc_ns_init(void)
+{
+ shm_init_ns(&init_ipc_ns);
+ return 0;
+}
+
+pure_initcall(ipc_ns_init);
+
+void __init shm_init(void)
+{
+ ipc_init_proc_interface("sysvipc/shm",
+#if BITS_PER_LONG <= 32
+ " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n",
+#else
+ " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n",
+#endif
+ IPC_SHM_IDS, sysvipc_shm_proc_show);
+}
+
+static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id)
+{
+ struct kern_ipc_perm *ipcp = ipc_obtain_object(&shm_ids(ns), id);
+
+ if (IS_ERR(ipcp))
+ return ERR_CAST(ipcp);
+
+ return container_of(ipcp, struct shmid_kernel, shm_perm);
+}
+
+static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id)
+{
+ struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id);
+
+ if (IS_ERR(ipcp))
+ return ERR_CAST(ipcp);
+
+ return container_of(ipcp, struct shmid_kernel, shm_perm);
+}
+
+/*
+ * shm_lock_(check_) routines are called in the paths where the rwsem
+ * is not necessarily held.
+ */
+static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
+{
+ struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
+
+ if (IS_ERR(ipcp))
+ return (struct shmid_kernel *)ipcp;
+
+ return container_of(ipcp, struct shmid_kernel, shm_perm);
+}
+
+static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
+{
+ rcu_read_lock();
+ ipc_lock_object(&ipcp->shm_perm);
+}
+
+static void shm_rcu_free(struct rcu_head *head)
+{
+ struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
+ struct shmid_kernel *shp = ipc_rcu_to_struct(p);
+
+ security_shm_free(shp);
+ ipc_rcu_free(head);
+}
+
+static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
+{
+ list_del(&s->shm_clist);
+ ipc_rmid(&shm_ids(ns), &s->shm_perm);
+}
+
+
+/* This is called by fork, once for every shm attach. */
+static void shm_open(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct shm_file_data *sfd = shm_file_data(file);
+ struct shmid_kernel *shp;
+
+ shp = shm_lock(sfd->ns, sfd->id);
+ BUG_ON(IS_ERR(shp));
+ shp->shm_atim = get_seconds();
+ shp->shm_lprid = task_tgid_vnr(current);
+ shp->shm_nattch++;
+ shm_unlock(shp);
+}
+
+/*
+ * shm_destroy - free the struct shmid_kernel
+ *
+ * @ns: namespace
+ * @shp: struct to free
+ *
+ * It has to be called with shp and shm_ids.rwsem (writer) locked,
+ * but returns with shp unlocked and freed.
+ */
+static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+{
+ struct file *shm_file;
+
+ shm_file = shp->shm_file;
+ shp->shm_file = NULL;
+ ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ shm_rmid(ns, shp);
+ shm_unlock(shp);
+ if (!is_file_hugepages(shm_file))
+ shmem_lock(shm_file, 0, shp->mlock_user);
+ else if (shp->mlock_user)
+ user_shm_unlock(i_size_read(file_inode(shm_file)),
+ shp->mlock_user);
+ fput(shm_file);
+ ipc_rcu_putref(shp, shm_rcu_free);
+}
+
+/*
+ * shm_may_destroy - identifies whether shm segment should be destroyed now
+ *
+ * Returns true if and only if there are no active users of the segment and
+ * one of the following is true:
+ *
+ * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
+ *
+ * 2) sysctl kernel.shm_rmid_forced is set to 1.
+ */
+static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+{
+ return (shp->shm_nattch == 0) &&
+ (ns->shm_rmid_forced ||
+ (shp->shm_perm.mode & SHM_DEST));
+}
+
+/*
+ * remove the attach descriptor vma.
+ * free memory for segment if it is marked destroyed.
+ * The descriptor has already been removed from the current->mm->mmap list
+ * and will later be kfree()d.
+ */
+static void shm_close(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct shm_file_data *sfd = shm_file_data(file);
+ struct shmid_kernel *shp;
+ struct ipc_namespace *ns = sfd->ns;
+
+ down_write(&shm_ids(ns).rwsem);
+ /* remove from the list of attaches of the shm segment */
+ shp = shm_lock(ns, sfd->id);
+ BUG_ON(IS_ERR(shp));
+ shp->shm_lprid = task_tgid_vnr(current);
+ shp->shm_dtim = get_seconds();
+ shp->shm_nattch--;
+ if (shm_may_destroy(ns, shp))
+ shm_destroy(ns, shp);
+ else
+ shm_unlock(shp);
+ up_write(&shm_ids(ns).rwsem);
+}
+
+/* Called with ns->shm_ids(ns).rwsem locked */
+static int shm_try_destroy_orphaned(int id, void *p, void *data)
+{
+ struct ipc_namespace *ns = data;
+ struct kern_ipc_perm *ipcp = p;
+ struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+
+ /*
+ * We want to destroy segments without users and whose originating
+ * process has already exited.
+ *
+ * As shp->* are changed under rwsem, it's safe to skip shp locking.
+ */
+ if (shp->shm_creator != NULL)
+ return 0;
+
+ if (shm_may_destroy(ns, shp)) {
+ shm_lock_by_ptr(shp);
+ shm_destroy(ns, shp);
+ }
+ return 0;
+}
+
+void shm_destroy_orphaned(struct ipc_namespace *ns)
+{
+ down_write(&shm_ids(ns).rwsem);
+ if (shm_ids(ns).in_use)
+ idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
+ up_write(&shm_ids(ns).rwsem);
+}
+
+/* Locking assumes this will only be called with task == current */
+void exit_shm(struct task_struct *task)
+{
+ struct ipc_namespace *ns = task->nsproxy->ipc_ns;
+ struct shmid_kernel *shp, *n;
+
+ if (list_empty(&task->sysvshm.shm_clist))
+ return;
+
+ /*
+ * If kernel.shm_rmid_forced is not set then only keep track of
+ * which shmids are orphaned, so that a later setting of the sysctl
+ * can clean them up.
+ */
+ if (!ns->shm_rmid_forced) {
+ down_read(&shm_ids(ns).rwsem);
+ list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
+ shp->shm_creator = NULL;
+ /*
+ * We hold only the read lock, but we are only called on current,
+ * so no entry on the list is shared with another task.
+ */
+ list_del(&task->sysvshm.shm_clist);
+ up_read(&shm_ids(ns).rwsem);
+ return;
+ }
+
+ /*
+ * Destroy all segments that were created but never mapped, and
+ * mark any mapped segments as orphaned to cover the sysctl toggling.
+ * Destroy is skipped if shm_may_destroy() returns false.
+ */
+ down_write(&shm_ids(ns).rwsem);
+ list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
+ shp->shm_creator = NULL;
+
+ if (shm_may_destroy(ns, shp)) {
+ shm_lock_by_ptr(shp);
+ shm_destroy(ns, shp);
+ }
+ }
+
+ /* Remove the list head from any segments still attached. */
+ list_del(&task->sysvshm.shm_clist);
+ up_write(&shm_ids(ns).rwsem);
+}
+
+static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct file *file = vma->vm_file;
+ struct shm_file_data *sfd = shm_file_data(file);
+
+ return sfd->vm_ops->fault(vma, vmf);
+}
+
+#ifdef CONFIG_NUMA
+static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+{
+ struct file *file = vma->vm_file;
+ struct shm_file_data *sfd = shm_file_data(file);
+ int err = 0;
+ if (sfd->vm_ops->set_policy)
+ err = sfd->vm_ops->set_policy(vma, new);
+ return err;
+}
+
+static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct file *file = vma->vm_file;
+ struct shm_file_data *sfd = shm_file_data(file);
+ struct mempolicy *pol = NULL;
+
+ if (sfd->vm_ops->get_policy)
+ pol = sfd->vm_ops->get_policy(vma, addr);
+ else if (vma->vm_policy)
+ pol = vma->vm_policy;
+
+ return pol;
+}
+#endif
+
+static int shm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct shm_file_data *sfd = shm_file_data(file);
+ int ret;
+
+ ret = sfd->file->f_op->mmap(sfd->file, vma);
+ if (ret != 0)
+ return ret;
+ sfd->vm_ops = vma->vm_ops;
+#ifdef CONFIG_MMU
+ BUG_ON(!sfd->vm_ops->fault);
+#endif
+ vma->vm_ops = &shm_vm_ops;
+ shm_open(vma);
+
+ return ret;
+}
+
+static int shm_release(struct inode *ino, struct file *file)
+{
+ struct shm_file_data *sfd = shm_file_data(file);
+
+ put_ipc_ns(sfd->ns);
+ shm_file_data(file) = NULL;
+ kfree(sfd);
+ return 0;
+}
+
+static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct shm_file_data *sfd = shm_file_data(file);
+
+ if (!sfd->file->f_op->fsync)
+ return -EINVAL;
+ return sfd->file->f_op->fsync(sfd->file, start, end, datasync);
+}
+
+static long shm_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct shm_file_data *sfd = shm_file_data(file);
+
+ if (!sfd->file->f_op->fallocate)
+ return -EOPNOTSUPP;
+ return sfd->file->f_op->fallocate(file, mode, offset, len);
+}
+
+static unsigned long shm_get_unmapped_area(struct file *file,
+ unsigned long addr, unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct shm_file_data *sfd = shm_file_data(file);
+ return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
+ pgoff, flags);
+}
+
+static const struct file_operations shm_file_operations = {
+ .mmap = shm_mmap,
+ .fsync = shm_fsync,
+ .release = shm_release,
+#ifndef CONFIG_MMU
+ .get_unmapped_area = shm_get_unmapped_area,
+#endif
+ .llseek = noop_llseek,
+ .fallocate = shm_fallocate,
+};
+
+static const struct file_operations shm_file_operations_huge = {
+ .mmap = shm_mmap,
+ .fsync = shm_fsync,
+ .release = shm_release,
+ .get_unmapped_area = shm_get_unmapped_area,
+ .llseek = noop_llseek,
+ .fallocate = shm_fallocate,
+};
+
+int is_file_shm_hugepages(struct file *file)
+{
+ return file->f_op == &shm_file_operations_huge;
+}
+
+static const struct vm_operations_struct shm_vm_ops = {
+ .open = shm_open, /* callback for a new vm-area open */
+ .close = shm_close, /* callback for when the vm-area is released */
+ .fault = shm_fault,
+#if defined(CONFIG_NUMA)
+ .set_policy = shm_set_policy,
+ .get_policy = shm_get_policy,
+#endif
+};
+
+/**
+ * newseg - Create a new shared memory segment
+ * @ns: namespace
+ * @params: ptr to the structure that contains key, size and shmflg
+ *
+ * Called with shm_ids.rwsem held as a writer.
+ */
+static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
+{
+ key_t key = params->key;
+ int shmflg = params->flg;
+ size_t size = params->u.size;
+ int error;
+ struct shmid_kernel *shp;
+ size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ struct file *file;
+ char name[13];
+ int id;
+ vm_flags_t acctflag = 0;
+
+ if (size < SHMMIN || size > ns->shm_ctlmax)
+ return -EINVAL;
+
+ if (numpages << PAGE_SHIFT < size)
+ return -ENOSPC;
+
+ if (ns->shm_tot + numpages < ns->shm_tot ||
+ ns->shm_tot + numpages > ns->shm_ctlall)
+ return -ENOSPC;
+
+ shp = ipc_rcu_alloc(sizeof(*shp));
+ if (!shp)
+ return -ENOMEM;
+
+ shp->shm_perm.key = key;
+ shp->shm_perm.mode = (shmflg & S_IRWXUGO);
+ shp->mlock_user = NULL;
+
+ shp->shm_perm.security = NULL;
+ error = security_shm_alloc(shp);
+ if (error) {
+ ipc_rcu_putref(shp, ipc_rcu_free);
+ return error;
+ }
+
+ sprintf(name, "SYSV%08x", key);
+ if (shmflg & SHM_HUGETLB) {
+ struct hstate *hs;
+ size_t hugesize;
+
+ hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
+ if (!hs) {
+ error = -EINVAL;
+ goto no_file;
+ }
+ hugesize = ALIGN(size, huge_page_size(hs));
+
+ /* hugetlb_file_setup applies strict accounting */
+ if (shmflg & SHM_NORESERVE)
+ acctflag = VM_NORESERVE;
+ file = hugetlb_file_setup(name, hugesize, acctflag,
+ &shp->mlock_user, HUGETLB_SHMFS_INODE,
+ (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
+ } else {
+ /*
+ * Do not allow accounting to be disabled for OVERCOMMIT_NEVER,
+ * even if it is asked for.
+ */
+ if ((shmflg & SHM_NORESERVE) &&
+ sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ acctflag = VM_NORESERVE;
+ file = shmem_file_setup(name, size, acctflag, 0);
+ }
+ error = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto no_file;
+
+ id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
+ if (id < 0) {
+ error = id;
+ goto no_id;
+ }
+
+ shp->shm_cprid = task_tgid_vnr(current);
+ shp->shm_lprid = 0;
+ shp->shm_atim = shp->shm_dtim = 0;
+ shp->shm_ctim = get_seconds();
+ shp->shm_segsz = size;
+ shp->shm_nattch = 0;
+ shp->shm_file = file;
+ shp->shm_creator = current;
+ list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
+
+ /*
+ * shmid gets reported as "inode#" in /proc/pid/maps.
+ * proc-ps tools use this. Changing this will break them.
+ */
+ file_inode(file)->i_ino = shp->shm_perm.id;
+
+ ns->shm_tot += numpages;
+ error = shp->shm_perm.id;
+
+ ipc_unlock_object(&shp->shm_perm);
+ rcu_read_unlock();
+ return error;
+
+no_id:
+ if (is_file_hugepages(file) && shp->mlock_user)
+ user_shm_unlock(size, shp->mlock_user);
+ fput(file);
+no_file:
+ ipc_rcu_putref(shp, shm_rcu_free);
+ return error;
+}
+
+/*
+ * Called with shm_ids.rwsem and ipcp locked.
+ */
+static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
+{
+ struct shmid_kernel *shp;
+
+ shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ return security_shm_associate(shp, shmflg);
+}
+
+/*
+ * Called with shm_ids.rwsem and ipcp locked.
+ */
+static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
+ struct ipc_params *params)
+{
+ struct shmid_kernel *shp;
+
+ shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ if (shp->shm_segsz < params->u.size)
+ return -EINVAL;
+
+ return 0;
+}
+
+SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
+{
+ struct ipc_namespace *ns;
+ static const struct ipc_ops shm_ops = {
+ .getnew = newseg,
+ .associate = shm_security,
+ .more_checks = shm_more_checks,
+ };
+ struct ipc_params shm_params;
+
+ ns = current->nsproxy->ipc_ns;
+
+ shm_params.key = key;
+ shm_params.flg = shmflg;
+ shm_params.u.size = size;
+
+ return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
+}
+
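+/*
+ * Rough userspace sketch of the segment lifecycle implemented above and below
+ * (assumes the standard shmget(2)/shmat(2)/shmdt(2)/shmctl(2) API; error
+ * handling omitted):
+ *
+ * int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);  - newseg()
+ * void *p = shmat(id, NULL, 0);                          - do_shmat()
+ * shmdt(p);                                              - shm_close() drops shm_nattch
+ * shmctl(id, IPC_RMID, NULL);                            - do_shm_rmid() destroys it
+ */
+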
+static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
+{
+ switch (version) {
+ case IPC_64:
+ return copy_to_user(buf, in, sizeof(*in));
+ case IPC_OLD:
+ {
+ struct shmid_ds out;
+
+ memset(&out, 0, sizeof(out));
+ ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
+ out.shm_segsz = in->shm_segsz;
+ out.shm_atime = in->shm_atime;
+ out.shm_dtime = in->shm_dtime;
+ out.shm_ctime = in->shm_ctime;
+ out.shm_cpid = in->shm_cpid;
+ out.shm_lpid = in->shm_lpid;
+ out.shm_nattch = in->shm_nattch;
+
+ return copy_to_user(buf, &out, sizeof(out));
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+static inline unsigned long
+copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
+{
+ switch (version) {
+ case IPC_64:
+ if (copy_from_user(out, buf, sizeof(*out)))
+ return -EFAULT;
+ return 0;
+ case IPC_OLD:
+ {
+ struct shmid_ds tbuf_old;
+
+ if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
+ return -EFAULT;
+
+ out->shm_perm.uid = tbuf_old.shm_perm.uid;
+ out->shm_perm.gid = tbuf_old.shm_perm.gid;
+ out->shm_perm.mode = tbuf_old.shm_perm.mode;
+
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
+{
+ switch (version) {
+ case IPC_64:
+ return copy_to_user(buf, in, sizeof(*in));
+ case IPC_OLD:
+ {
+ struct shminfo out;
+
+ if (in->shmmax > INT_MAX)
+ out.shmmax = INT_MAX;
+ else
+ out.shmmax = (int)in->shmmax;
+
+ out.shmmin = in->shmmin;
+ out.shmmni = in->shmmni;
+ out.shmseg = in->shmseg;
+ out.shmall = in->shmall;
+
+ return copy_to_user(buf, &out, sizeof(out));
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * Calculate and add used RSS and swap pages of a shm.
+ * Called with shm_ids.rwsem held as a reader
+ */
+static void shm_add_rss_swap(struct shmid_kernel *shp,
+ unsigned long *rss_add, unsigned long *swp_add)
+{
+ struct inode *inode;
+
+ inode = file_inode(shp->shm_file);
+
+ if (is_file_hugepages(shp->shm_file)) {
+ struct address_space *mapping = inode->i_mapping;
+ struct hstate *h = hstate_file(shp->shm_file);
+ *rss_add += pages_per_huge_page(h) * mapping->nrpages;
+ } else {
+#ifdef CONFIG_SHMEM
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ spin_lock(&info->lock);
+ *rss_add += inode->i_mapping->nrpages;
+ *swp_add += info->swapped;
+ spin_unlock(&info->lock);
+#else
+ *rss_add += inode->i_mapping->nrpages;
+#endif
+ }
+}
+
+/*
+ * Called with shm_ids.rwsem held as a reader
+ */
+static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
+ unsigned long *swp)
+{
+ int next_id;
+ int total, in_use;
+
+ *rss = 0;
+ *swp = 0;
+
+ in_use = shm_ids(ns).in_use;
+
+ for (total = 0, next_id = 0; total < in_use; next_id++) {
+ struct kern_ipc_perm *ipc;
+ struct shmid_kernel *shp;
+
+ ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
+ if (ipc == NULL)
+ continue;
+ shp = container_of(ipc, struct shmid_kernel, shm_perm);
+
+ shm_add_rss_swap(shp, rss, swp);
+
+ total++;
+ }
+}
+
+/*
+ * This function handles some shmctl commands which require the rwsem
+ * to be held in write mode.
+ * NOTE: the caller must hold no locks; the rwsem is taken inside this function.
+ */
+static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
+ struct shmid_ds __user *buf, int version)
+{
+ struct kern_ipc_perm *ipcp;
+ struct shmid64_ds shmid64;
+ struct shmid_kernel *shp;
+ int err;
+
+ if (cmd == IPC_SET) {
+ if (copy_shmid_from_user(&shmid64, buf, version))
+ return -EFAULT;
+ }
+
+ down_write(&shm_ids(ns).rwsem);
+ rcu_read_lock();
+
+ ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
+ &shmid64.shm_perm, 0);
+ if (IS_ERR(ipcp)) {
+ err = PTR_ERR(ipcp);
+ goto out_unlock1;
+ }
+
+ shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+
+ err = security_shm_shmctl(shp, cmd);
+ if (err)
+ goto out_unlock1;
+
+ switch (cmd) {
+ case IPC_RMID:
+ ipc_lock_object(&shp->shm_perm);
+ /* do_shm_rmid unlocks the ipc object and rcu */
+ do_shm_rmid(ns, ipcp);
+ goto out_up;
+ case IPC_SET:
+ ipc_lock_object(&shp->shm_perm);
+ err = ipc_update_perm(&shmid64.shm_perm, ipcp);
+ if (err)
+ goto out_unlock0;
+ shp->shm_ctim = get_seconds();
+ break;
+ default:
+ err = -EINVAL;
+ goto out_unlock1;
+ }
+
+out_unlock0:
+ ipc_unlock_object(&shp->shm_perm);
+out_unlock1:
+ rcu_read_unlock();
+out_up:
+ up_write(&shm_ids(ns).rwsem);
+ return err;
+}
+
+static int shmctl_nolock(struct ipc_namespace *ns, int shmid,
+ int cmd, int version, void __user *buf)
+{
+ int err;
+ struct shmid_kernel *shp;
+
+ /* preliminary security checks for *_INFO */
+ if (cmd == IPC_INFO || cmd == SHM_INFO) {
+ err = security_shm_shmctl(NULL, cmd);
+ if (err)
+ return err;
+ }
+
+ switch (cmd) {
+ case IPC_INFO:
+ {
+ struct shminfo64 shminfo;
+
+ memset(&shminfo, 0, sizeof(shminfo));
+ shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
+ shminfo.shmmax = ns->shm_ctlmax;
+ shminfo.shmall = ns->shm_ctlall;
+
+ shminfo.shmmin = SHMMIN;
+ if (copy_shminfo_to_user(buf, &shminfo, version))
+ return -EFAULT;
+
+ down_read(&shm_ids(ns).rwsem);
+ err = ipc_get_maxid(&shm_ids(ns));
+ up_read(&shm_ids(ns).rwsem);
+
+ if (err < 0)
+ err = 0;
+ goto out;
+ }
+ case SHM_INFO:
+ {
+ struct shm_info shm_info;
+
+ memset(&shm_info, 0, sizeof(shm_info));
+ down_read(&shm_ids(ns).rwsem);
+ shm_info.used_ids = shm_ids(ns).in_use;
+ shm_get_stat(ns, &shm_info.shm_rss, &shm_info.shm_swp);
+ shm_info.shm_tot = ns->shm_tot;
+ shm_info.swap_attempts = 0;
+ shm_info.swap_successes = 0;
+ err = ipc_get_maxid(&shm_ids(ns));
+ up_read(&shm_ids(ns).rwsem);
+ if (copy_to_user(buf, &shm_info, sizeof(shm_info))) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ err = err < 0 ? 0 : err;
+ goto out;
+ }
+ case SHM_STAT:
+ case IPC_STAT:
+ {
+ struct shmid64_ds tbuf;
+ int result;
+
+ rcu_read_lock();
+ if (cmd == SHM_STAT) {
+ shp = shm_obtain_object(ns, shmid);
+ if (IS_ERR(shp)) {
+ err = PTR_ERR(shp);
+ goto out_unlock;
+ }
+ result = shp->shm_perm.id;
+ } else {
+ shp = shm_obtain_object_check(ns, shmid);
+ if (IS_ERR(shp)) {
+ err = PTR_ERR(shp);
+ goto out_unlock;
+ }
+ result = 0;
+ }
+
+ err = -EACCES;
+ if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
+ goto out_unlock;
+
+ err = security_shm_shmctl(shp, cmd);
+ if (err)
+ goto out_unlock;
+
+ memset(&tbuf, 0, sizeof(tbuf));
+ kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
+ tbuf.shm_segsz = shp->shm_segsz;
+ tbuf.shm_atime = shp->shm_atim;
+ tbuf.shm_dtime = shp->shm_dtim;
+ tbuf.shm_ctime = shp->shm_ctim;
+ tbuf.shm_cpid = shp->shm_cprid;
+ tbuf.shm_lpid = shp->shm_lprid;
+ tbuf.shm_nattch = shp->shm_nattch;
+ rcu_read_unlock();
+
+ if (copy_shmid_to_user(buf, &tbuf, version))
+ err = -EFAULT;
+ else
+ err = result;
+ goto out;
+ }
+ default:
+ return -EINVAL;
+ }
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ return err;
+}
+
+SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
+{
+ struct shmid_kernel *shp;
+ int err, version;
+ struct ipc_namespace *ns;
+
+ if (cmd < 0 || shmid < 0)
+ return -EINVAL;
+
+ version = ipc_parse_version(&cmd);
+ ns = current->nsproxy->ipc_ns;
+
+ switch (cmd) {
+ case IPC_INFO:
+ case SHM_INFO:
+ case SHM_STAT:
+ case IPC_STAT:
+ return shmctl_nolock(ns, shmid, cmd, version, buf);
+ case IPC_RMID:
+ case IPC_SET:
+ return shmctl_down(ns, shmid, cmd, buf, version);
+ case SHM_LOCK:
+ case SHM_UNLOCK:
+ {
+ struct file *shm_file;
+
+ rcu_read_lock();
+ shp = shm_obtain_object_check(ns, shmid);
+ if (IS_ERR(shp)) {
+ err = PTR_ERR(shp);
+ goto out_unlock1;
+ }
+
+ audit_ipc_obj(&(shp->shm_perm));
+ err = security_shm_shmctl(shp, cmd);
+ if (err)
+ goto out_unlock1;
+
+ ipc_lock_object(&shp->shm_perm);
+
+ /* check if shm_destroy() is tearing down shp */
+ if (!ipc_valid_object(&shp->shm_perm)) {
+ err = -EIDRM;
+ goto out_unlock0;
+ }
+
+ if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
+ kuid_t euid = current_euid();
+ if (!uid_eq(euid, shp->shm_perm.uid) &&
+ !uid_eq(euid, shp->shm_perm.cuid)) {
+ err = -EPERM;
+ goto out_unlock0;
+ }
+ if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) {
+ err = -EPERM;
+ goto out_unlock0;
+ }
+ }
+
+ shm_file = shp->shm_file;
+ if (is_file_hugepages(shm_file))
+ goto out_unlock0;
+
+ if (cmd == SHM_LOCK) {
+ struct user_struct *user = current_user();
+ err = shmem_lock(shm_file, 1, user);
+ if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
+ shp->shm_perm.mode |= SHM_LOCKED;
+ shp->mlock_user = user;
+ }
+ goto out_unlock0;
+ }
+
+ /* SHM_UNLOCK */
+ if (!(shp->shm_perm.mode & SHM_LOCKED))
+ goto out_unlock0;
+ shmem_lock(shm_file, 0, shp->mlock_user);
+ shp->shm_perm.mode &= ~SHM_LOCKED;
+ shp->mlock_user = NULL;
+ get_file(shm_file);
+ ipc_unlock_object(&shp->shm_perm);
+ rcu_read_unlock();
+ shmem_unlock_mapping(shm_file->f_mapping);
+
+ fput(shm_file);
+ return err;
+ }
+ default:
+ return -EINVAL;
+ }
+
+out_unlock0:
+ ipc_unlock_object(&shp->shm_perm);
+out_unlock1:
+ rcu_read_unlock();
+ return err;
+}
+
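+/*
+ * Rough userspace sketch of the SHM_LOCK path handled above (assumes the
+ * standard shmctl(2) API and a sufficient RLIMIT_MEMLOCK; error handling
+ * omitted):
+ *
+ * shmctl(id, SHM_LOCK, NULL);    - pins the pages via shmem_lock()
+ * shmctl(id, SHM_UNLOCK, NULL);  - unpins, then shmem_unlock_mapping()
+ */
+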
+/*
+ * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
+ *
+ * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
+ * "raddr" argument points to kernel space, so there has to be a wrapper
+ * around this.
+ */
+long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
+ unsigned long shmlba)
+{
+ struct shmid_kernel *shp;
+ unsigned long addr;
+ unsigned long size;
+ struct file *file;
+ int err;
+ unsigned long flags;
+ unsigned long prot;
+ int acc_mode;
+ struct ipc_namespace *ns;
+ struct shm_file_data *sfd;
+ struct path path;
+ fmode_t f_mode;
+ unsigned long populate = 0;
+
+ err = -EINVAL;
+ if (shmid < 0)
+ goto out;
+ else if ((addr = (ulong)shmaddr)) {
+ if (addr & (shmlba - 1)) {
+ if (shmflg & SHM_RND)
+ addr &= ~(shmlba - 1); /* round down */
+ else
+#ifndef __ARCH_FORCE_SHMLBA
+ if (addr & ~PAGE_MASK)
+#endif
+ goto out;
+ }
+ flags = MAP_SHARED | MAP_FIXED;
+ } else {
+ if ((shmflg & SHM_REMAP))
+ goto out;
+
+ flags = MAP_SHARED;
+ }
+
+ if (shmflg & SHM_RDONLY) {
+ prot = PROT_READ;
+ acc_mode = S_IRUGO;
+ f_mode = FMODE_READ;
+ } else {
+ prot = PROT_READ | PROT_WRITE;
+ acc_mode = S_IRUGO | S_IWUGO;
+ f_mode = FMODE_READ | FMODE_WRITE;
+ }
+ if (shmflg & SHM_EXEC) {
+ prot |= PROT_EXEC;
+ acc_mode |= S_IXUGO;
+ }
+
+ /*
+ * We cannot rely on the fs check since SYSV IPC does have an
+ * additional creator id...
+ */
+ ns = current->nsproxy->ipc_ns;
+ rcu_read_lock();
+ shp = shm_obtain_object_check(ns, shmid);
+ if (IS_ERR(shp)) {
+ err = PTR_ERR(shp);
+ goto out_unlock;
+ }
+
+ err = -EACCES;
+ if (ipcperms(ns, &shp->shm_perm, acc_mode))
+ goto out_unlock;
+
+ err = security_shm_shmat(shp, shmaddr, shmflg);
+ if (err)
+ goto out_unlock;
+
+ ipc_lock_object(&shp->shm_perm);
+
+ /* check if shm_destroy() is tearing down shp */
+ if (!ipc_valid_object(&shp->shm_perm)) {
+ ipc_unlock_object(&shp->shm_perm);
+ err = -EIDRM;
+ goto out_unlock;
+ }
+
+ path = shp->shm_file->f_path;
+ path_get(&path);
+ shp->shm_nattch++;
+ size = i_size_read(d_inode(path.dentry));
+ ipc_unlock_object(&shp->shm_perm);
+ rcu_read_unlock();
+
+ err = -ENOMEM;
+ sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
+ if (!sfd) {
+ path_put(&path);
+ goto out_nattch;
+ }
+
+ file = alloc_file(&path, f_mode,
+ is_file_hugepages(shp->shm_file) ?
+ &shm_file_operations_huge :
+ &shm_file_operations);
+ err = PTR_ERR(file);
+ if (IS_ERR(file)) {
+ kfree(sfd);
+ path_put(&path);
+ goto out_nattch;
+ }
+
+ file->private_data = sfd;
+ file->f_mapping = shp->shm_file->f_mapping;
+ sfd->id = shp->shm_perm.id;
+ sfd->ns = get_ipc_ns(ns);
+ sfd->file = shp->shm_file;
+ sfd->vm_ops = NULL;
+
+ err = security_mmap_file(file, prot, flags);
+ if (err)
+ goto out_fput;
+
+ down_write(&current->mm->mmap_sem);
+ if (addr && !(shmflg & SHM_REMAP)) {
+ err = -EINVAL;
+ if (addr + size < addr)
+ goto invalid;
+
+ if (find_vma_intersection(current->mm, addr, addr + size))
+ goto invalid;
+ }
+
+ addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
+ *raddr = addr;
+ err = 0;
+ if (IS_ERR_VALUE(addr))
+ err = (long)addr;
+invalid:
+ up_write(&current->mm->mmap_sem);
+ if (populate)
+ mm_populate(addr, populate);
+
+out_fput:
+ fput(file);
+
+out_nattch:
+ down_write(&shm_ids(ns).rwsem);
+ shp = shm_lock(ns, shmid);
+ BUG_ON(IS_ERR(shp));
+ shp->shm_nattch--;
+ if (shm_may_destroy(ns, shp))
+ shm_destroy(ns, shp);
+ else
+ shm_unlock(shp);
+ up_write(&shm_ids(ns).rwsem);
+ return err;
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ return err;
+}
+
+SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
+{
+ unsigned long ret;
+ long err;
+
+ err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA);
+ if (err)
+ return err;
+ force_successful_syscall_return();
+ return (long)ret;
+}
+
+/*
+ * detach and kill segment if marked destroyed.
+ * The work is done in shm_close.
+ */
+SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long addr = (unsigned long)shmaddr;
+ int retval = -EINVAL;
+#ifdef CONFIG_MMU
+ loff_t size = 0;
+ struct file *file;
+ struct vm_area_struct *next;
+#endif
+
+ if (addr & ~PAGE_MASK)
+ return retval;
+
+ down_write(&mm->mmap_sem);
+
+ /*
+ * This function tries to be smart and unmap shm segments that
+ * were modified by partial mlock or munmap calls:
+ * - It first determines the size of the shm segment that should be
+ * unmapped: It searches for a vma that is backed by shm and that
+ * started at address shmaddr. It records its size and then unmaps
+ * it.
+ * - Then it unmaps all shm vmas that started at shmaddr and that
+ * are within the initially determined size and that are from the
+ * same shm segment from which we determined the size.
+ * Errors from do_munmap are ignored: the function only fails if
+ * it's called with invalid parameters or if it's called to unmap
+ * a part of a vma. Both calls in this function are for full vmas;
+ * the parameters are directly copied from the vma itself and are always
+ * valid - therefore do_munmap cannot fail. (famous last words?)
+ */
+ /*
+ * If it had been mremap()'d, the starting address would not
+ * match the usual checks anyway. So assume all vma's are
+ * above the starting address given.
+ */
+ vma = find_vma(mm, addr);
+
+#ifdef CONFIG_MMU
+ while (vma) {
+ next = vma->vm_next;
+
+ /*
+ * Check if the starting address would match, i.e. it's
+ * a fragment created by mprotect() and/or munmap(), or
+ * otherwise it starts at this address with no hassles.
+ */
+ if ((vma->vm_ops == &shm_vm_ops) &&
+ (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {
+
+ /*
+ * Record the file of the shm segment being
+ * unmapped. With mremap(), someone could place
+ * page from another segment but with equal offsets
+ * in the range we are unmapping.
+ */
+ file = vma->vm_file;
+ size = i_size_read(file_inode(vma->vm_file));
+ do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+ /*
+ * We discovered the size of the shm segment, so
+ * break out of here and fall through to the next
+ * loop that uses the size information to stop
+ * searching for matching vma's.
+ */
+ retval = 0;
+ vma = next;
+ break;
+ }
+ vma = next;
+ }
+
+ /*
+ * We need look no further than the maximum address a fragment
+ * could possibly have landed at. Also cast things to loff_t to
+ * prevent overflows and make comparisons vs. equal-width types.
+ */
+ size = PAGE_ALIGN(size);
+ while (vma && (loff_t)(vma->vm_end - addr) <= size) {
+ next = vma->vm_next;
+
+ /* finding a matching vma now does not alter retval */
+ if ((vma->vm_ops == &shm_vm_ops) &&
+ ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
+ (vma->vm_file == file))
+ do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+ vma = next;
+ }
+
+#else /* CONFIG_MMU */
+ /* under NOMMU conditions, the exact address to be destroyed must be
+ * given */
+ if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
+ do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+ retval = 0;
+ }
+
+#endif
+
+ up_write(&mm->mmap_sem);
+ return retval;
+}
+
+#ifdef CONFIG_PROC_FS
+static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
+{
+ struct user_namespace *user_ns = seq_user_ns(s);
+ struct shmid_kernel *shp = it;
+ unsigned long rss = 0, swp = 0;
+
+ shm_add_rss_swap(shp, &rss, &swp);
+
+#if BITS_PER_LONG <= 32
+#define SIZE_SPEC "%10lu"
+#else
+#define SIZE_SPEC "%21lu"
+#endif
+
+ seq_printf(s,
+ "%10d %10d %4o " SIZE_SPEC " %5u %5u "
+ "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
+ SIZE_SPEC " " SIZE_SPEC "\n",
+ shp->shm_perm.key,
+ shp->shm_perm.id,
+ shp->shm_perm.mode,
+ shp->shm_segsz,
+ shp->shm_cprid,
+ shp->shm_lprid,
+ shp->shm_nattch,
+ from_kuid_munged(user_ns, shp->shm_perm.uid),
+ from_kgid_munged(user_ns, shp->shm_perm.gid),
+ from_kuid_munged(user_ns, shp->shm_perm.cuid),
+ from_kgid_munged(user_ns, shp->shm_perm.cgid),
+ shp->shm_atim,
+ shp->shm_dtim,
+ shp->shm_ctim,
+ rss * PAGE_SIZE,
+ swp * PAGE_SIZE);
+
+ return 0;
+}
+#endif
diff --git a/ipc/syscall.c b/ipc/syscall.c
new file mode 100644
index 000000000..52429489c
--- /dev/null
+++ b/ipc/syscall.c
@@ -0,0 +1,99 @@
+/*
+ * sys_ipc() is the old de-multiplexer for the SysV IPC calls.
+ *
+ * This is really horribly ugly, and new architectures should just wire up
+ * the individual syscalls instead.
+ */
+#include <linux/unistd.h>
+
+#ifdef __ARCH_WANT_SYS_IPC
+#include <linux/errno.h>
+#include <linux/ipc.h>
+#include <linux/shm.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second,
+ unsigned long, third, void __user *, ptr, long, fifth)
+{
+ int version, ret;
+
+ version = call >> 16; /* hack for backward compatibility */
+ call &= 0xffff;
+
+ switch (call) {
+ case SEMOP:
+ return sys_semtimedop(first, (struct sembuf __user *)ptr,
+ second, NULL);
+ case SEMTIMEDOP:
+ return sys_semtimedop(first, (struct sembuf __user *)ptr,
+ second,
+ (const struct timespec __user *)fifth);
+
+ case SEMGET:
+ return sys_semget(first, second, third);
+ case SEMCTL: {
+ unsigned long arg;
+ if (!ptr)
+ return -EINVAL;
+ if (get_user(arg, (unsigned long __user *) ptr))
+ return -EFAULT;
+ return sys_semctl(first, second, third, arg);
+ }
+
+ case MSGSND:
+ return sys_msgsnd(first, (struct msgbuf __user *) ptr,
+ second, third);
+ case MSGRCV:
+ switch (version) {
+ case 0: {
+ struct ipc_kludge tmp;
+ if (!ptr)
+ return -EINVAL;
+
+ if (copy_from_user(&tmp,
+ (struct ipc_kludge __user *) ptr,
+ sizeof(tmp)))
+ return -EFAULT;
+ return sys_msgrcv(first, tmp.msgp, second,
+ tmp.msgtyp, third);
+ }
+ default:
+ return sys_msgrcv(first,
+ (struct msgbuf __user *) ptr,
+ second, fifth, third);
+ }
+ case MSGGET:
+ return sys_msgget((key_t) first, second);
+ case MSGCTL:
+ return sys_msgctl(first, second, (struct msqid_ds __user *)ptr);
+
+ case SHMAT:
+ switch (version) {
+ default: {
+ unsigned long raddr;
+ ret = do_shmat(first, (char __user *)ptr,
+ second, &raddr, SHMLBA);
+ if (ret)
+ return ret;
+ return put_user(raddr, (unsigned long __user *) third);
+ }
+ case 1:
+ /*
+ * This was the entry point for kernel-originating calls
+ * from iBCS2 in 2.2 days.
+ */
+ return -EINVAL;
+ }
+ case SHMDT:
+ return sys_shmdt((char __user *)ptr);
+ case SHMGET:
+ return sys_shmget(first, second, third);
+ case SHMCTL:
+ return sys_shmctl(first, second,
+ (struct shmid_ds __user *) ptr);
+ default:
+ return -ENOSYS;
+ }
+}
+#endif
diff --git a/ipc/util.c b/ipc/util.c
new file mode 100644
index 000000000..ff3323ef8
--- /dev/null
+++ b/ipc/util.c
@@ -0,0 +1,883 @@
+/*
+ * linux/ipc/util.c
+ * Copyright (C) 1992 Krishna Balasubramanian
+ *
+ * Sep 1997 - Call suser() last after "normal" permission checks so we
+ * get BSD style process accounting right.
+ * Occurs in several places in the IPC code.
+ * Chris Evans, <chris@ferret.lmh.ox.ac.uk>
+ * Nov 1999 - ipc helper functions, unified SMP locking
+ * Manfred Spraul <manfred@colorfullife.com>
+ * Oct 2002 - One lock per IPC id. RCU ipc_free for lock-free grow_ary().
+ * Mingming Cao <cmm@us.ibm.com>
+ * Mar 2006 - support for audit of ipc object properties
+ * Dustin Kirkland <dustin.kirkland@us.ibm.com>
+ * Jun 2006 - namespaces support
+ * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ *
+ * General sysv ipc locking scheme:
+ * rcu_read_lock()
+ * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr
+ * tree.
+ * - perform initial checks (capabilities, auditing and permission,
+ * etc).
+ * - perform read-only operations, such as STAT, INFO commands.
+ * acquire the ipc lock (kern_ipc_perm.lock) through
+ * ipc_lock_object()
+ * - perform data updates, such as SET, RMID commands and
+ * mechanism-specific operations (semop/semtimedop,
+ * msgsnd/msgrcv, shmat/shmdt).
+ * drop the ipc lock, through ipc_unlock_object().
+ * rcu_read_unlock()
+ *
+ * The ids->rwsem must be taken when:
+ * - creating, removing and iterating the existing entries in ipc
+ * identifier sets.
+ * - iterating through files under /proc/sysvipc/
+ *
+ * Note that sems have a special fast path that avoids kern_ipc_perm.lock -
+ * see sem_lock().
+ */
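+
+/*
+ * Illustrative sketch of the scheme above (not a real code path; error
+ * handling is trimmed and the label names are made up):
+ *
+ *	rcu_read_lock();
+ *	ipcp = ipc_obtain_object_check(ids, id);
+ *	if (IS_ERR(ipcp))
+ *		goto out_unlock_rcu;
+ *	... read-only work: ipcperms(), audit, STAT-style copies ...
+ *	ipc_lock_object(ipcp);
+ *	if (!ipc_valid_object(ipcp))
+ *		goto out_unlock_object;
+ *	... updates under kern_ipc_perm.lock ...
+ * out_unlock_object:
+ *	ipc_unlock_object(ipcp);
+ * out_unlock_rcu:
+ *	rcu_read_unlock();
+ */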
+
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/init.h>
+#include <linux/msg.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/capability.h>
+#include <linux/highuid.h>
+#include <linux/security.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/audit.h>
+#include <linux/nsproxy.h>
+#include <linux/rwsem.h>
+#include <linux/memory.h>
+#include <linux/ipc_namespace.h>
+
+#include <asm/unistd.h>
+
+#include "util.h"
+
+struct ipc_proc_iface {
+ const char *path;
+ const char *header;
+ int ids;
+ int (*show)(struct seq_file *, void *);
+};
+
+/**
+ * ipc_init - initialise ipc subsystem
+ *
+ * The various sysv ipc resources (semaphores, messages and shared
+ * memory) are initialised.
+ *
+ * A callback routine is registered into the memory hotplug notifier
+ * chain: since msgmni scales to lowmem this callback routine will be
+ * called upon successful memory add / remove to recompute msgmni.
+ */
+static int __init ipc_init(void)
+{
+ sem_init();
+ msg_init();
+ shm_init();
+ return 0;
+}
+device_initcall(ipc_init);
+
+/**
+ * ipc_init_ids - initialise ipc identifiers
+ * @ids: ipc identifier set
+ *
+ * Set up the sequence range to use for the ipc identifier range (limited
+ * below IPCMNI) then initialise the ids idr.
+ */
+void ipc_init_ids(struct ipc_ids *ids)
+{
+ ids->in_use = 0;
+ ids->seq = 0;
+ ids->next_id = -1;
+ init_rwsem(&ids->rwsem);
+ idr_init(&ids->ipcs_idr);
+}
+
+#ifdef CONFIG_PROC_FS
+static const struct file_operations sysvipc_proc_fops;
+/**
+ * ipc_init_proc_interface - create a proc interface for sysvipc types using a seq_file interface.
+ * @path: Path in procfs
+ * @header: Banner to be printed at the beginning of the file.
+ * @ids: ipc id table to iterate.
+ * @show: show routine.
+ */
+void __init ipc_init_proc_interface(const char *path, const char *header,
+ int ids, int (*show)(struct seq_file *, void *))
+{
+ struct proc_dir_entry *pde;
+ struct ipc_proc_iface *iface;
+
+ iface = kmalloc(sizeof(*iface), GFP_KERNEL);
+ if (!iface)
+ return;
+ iface->path = path;
+ iface->header = header;
+ iface->ids = ids;
+ iface->show = show;
+
+ pde = proc_create_data(path,
+ S_IRUGO, /* world readable */
+ NULL, /* parent dir */
+ &sysvipc_proc_fops,
+ iface);
+ if (!pde)
+ kfree(iface);
+}
+#endif
+
+/**
+ * ipc_findkey - find a key in an ipc identifier set
+ * @ids: ipc identifier set
+ * @key: key to find
+ *
+ * Returns the locked pointer to the ipc structure if found, or NULL
+ * otherwise. On success the object is returned with both the RCU read
+ * lock and the object's spinlock held.
+ *
+ * Called with ipc_ids.rwsem held.
+ */
+static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
+{
+ struct kern_ipc_perm *ipc;
+ int next_id;
+ int total;
+
+ for (total = 0, next_id = 0; total < ids->in_use; next_id++) {
+ ipc = idr_find(&ids->ipcs_idr, next_id);
+
+ if (ipc == NULL)
+ continue;
+
+ if (ipc->key != key) {
+ total++;
+ continue;
+ }
+
+ rcu_read_lock();
+ ipc_lock_object(ipc);
+ return ipc;
+ }
+
+ return NULL;
+}
+
+/**
+ * ipc_get_maxid - get the last assigned id
+ * @ids: ipc identifier set
+ *
+ * Called with ipc_ids.rwsem held.
+ */
+int ipc_get_maxid(struct ipc_ids *ids)
+{
+ struct kern_ipc_perm *ipc;
+ int max_id = -1;
+ int total, id;
+
+ if (ids->in_use == 0)
+ return -1;
+
+ if (ids->in_use == IPCMNI)
+ return IPCMNI - 1;
+
+ /* Look for the last assigned id */
+ total = 0;
+ for (id = 0; id < IPCMNI && total < ids->in_use; id++) {
+ ipc = idr_find(&ids->ipcs_idr, id);
+ if (ipc != NULL) {
+ max_id = id;
+ total++;
+ }
+ }
+ return max_id;
+}
+
+/**
+ * ipc_addid - add an ipc identifier
+ * @ids: ipc identifier set
+ * @new: new ipc permission set
+ * @size: limit for the number of used ids
+ *
+ * Add an entry 'new' to the ipc ids idr. The permissions object is
+ * initialised, the first free entry is set up, and the assigned id is
+ * returned. The 'new' entry is returned in a locked state on success.
+ * On failure the entry is not locked and a negative error code is returned.
+ *
+ * Called with writer ipc_ids.rwsem held.
+ */
+int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
+{
+ kuid_t euid;
+ kgid_t egid;
+ int id;
+ int next_id = ids->next_id;
+
+ if (size > IPCMNI)
+ size = IPCMNI;
+
+ if (ids->in_use >= size)
+ return -ENOSPC;
+
+ idr_preload(GFP_KERNEL);
+
+ spin_lock_init(&new->lock);
+ new->deleted = false;
+ rcu_read_lock();
+ spin_lock(&new->lock);
+
+ id = idr_alloc(&ids->ipcs_idr, new,
+ (next_id < 0) ? 0 : ipcid_to_idx(next_id), 0,
+ GFP_NOWAIT);
+ idr_preload_end();
+ if (id < 0) {
+ spin_unlock(&new->lock);
+ rcu_read_unlock();
+ return id;
+ }
+
+ ids->in_use++;
+
+ current_euid_egid(&euid, &egid);
+ new->cuid = new->uid = euid;
+ new->gid = new->cgid = egid;
+
+ if (next_id < 0) {
+ new->seq = ids->seq++;
+ if (ids->seq > IPCID_SEQ_MAX)
+ ids->seq = 0;
+ } else {
+ new->seq = ipcid_to_seqx(next_id);
+ ids->next_id = -1;
+ }
+
+ new->id = ipc_buildid(id, new->seq);
+ return id;
+}
+
+/**
+ * ipcget_new - create a new ipc object
+ * @ns: ipc namespace
+ * @ids: ipc identifier set
+ * @ops: the actual creation routine to call
+ * @params: its parameters
+ *
+ * This routine is called by sys_msgget(), sys_semget() and sys_shmget()
+ * when the key is IPC_PRIVATE.
+ */
+static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids,
+ const struct ipc_ops *ops, struct ipc_params *params)
+{
+ int err;
+
+ down_write(&ids->rwsem);
+ err = ops->getnew(ns, params);
+ up_write(&ids->rwsem);
+ return err;
+}
+
+/**
+ * ipc_check_perms - check security and permissions for an ipc object
+ * @ns: ipc namespace
+ * @ipcp: ipc permission set
+ * @ops: the actual security routine to call
+ * @params: its parameters
+ *
+ * This routine is called by sys_msgget(), sys_semget() and sys_shmget()
+ * when the key is not IPC_PRIVATE and that key already exists in the
+ * ids IDR.
+ *
+ * On success, the ipc id is returned.
+ *
+ * It is called with ipc_ids.rwsem and ipcp->lock held.
+ */
+static int ipc_check_perms(struct ipc_namespace *ns,
+ struct kern_ipc_perm *ipcp,
+ const struct ipc_ops *ops,
+ struct ipc_params *params)
+{
+ int err;
+
+ if (ipcperms(ns, ipcp, params->flg))
+ err = -EACCES;
+ else {
+ err = ops->associate(ipcp, params->flg);
+ if (!err)
+ err = ipcp->id;
+ }
+
+ return err;
+}
+
+/**
+ * ipcget_public - get an ipc object or create a new one
+ * @ns: ipc namespace
+ * @ids: ipc identifier set
+ * @ops: the actual creation routine to call
+ * @params: its parameters
+ *
+ * This routine is called by sys_msgget(), sys_semget() and sys_shmget()
+ * when the key is not IPC_PRIVATE.
+ * It adds a new entry if the key is not found, and performs some
+ * permission / security checks if the key is found.
+ *
+ * On success, the ipc id is returned.
+ */
+static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
+ const struct ipc_ops *ops, struct ipc_params *params)
+{
+ struct kern_ipc_perm *ipcp;
+ int flg = params->flg;
+ int err;
+
+ /*
+ * Take the lock as a writer since we are potentially going to add
+ * a new entry; read locks are not "upgradable" to write locks.
+ */
+ down_write(&ids->rwsem);
+ ipcp = ipc_findkey(ids, params->key);
+ if (ipcp == NULL) {
+ /* key not used */
+ if (!(flg & IPC_CREAT))
+ err = -ENOENT;
+ else
+ err = ops->getnew(ns, params);
+ } else {
+ /* ipc object has been locked by ipc_findkey() */
+
+ if (flg & IPC_CREAT && flg & IPC_EXCL)
+ err = -EEXIST;
+ else {
+ err = 0;
+ if (ops->more_checks)
+ err = ops->more_checks(ipcp, params);
+ if (!err)
+ /*
+ * ipc_check_perms returns the IPC id on
+ * success
+ */
+ err = ipc_check_perms(ns, ipcp, ops, params);
+ }
+ ipc_unlock(ipcp);
+ }
+ up_write(&ids->rwsem);
+
+ return err;
+}
+
+
+/**
+ * ipc_rmid - remove an ipc identifier
+ * @ids: ipc identifier set
+ * @ipcp: ipc perm structure containing the identifier to remove
+ *
+ * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
+ * before this function is called, and remain locked on exit.
+ */
+void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
+{
+ int lid = ipcid_to_idx(ipcp->id);
+
+ idr_remove(&ids->ipcs_idr, lid);
+ ids->in_use--;
+ ipcp->deleted = true;
+}
+
+/**
+ * ipc_alloc - allocate ipc space
+ * @size: size desired
+ *
+ * Allocate memory from the appropriate allocator (kmalloc() for requests
+ * up to PAGE_SIZE, vmalloc() for larger ones) and return a pointer to it.
+ * NULL is returned if the allocation fails.
+ */
+void *ipc_alloc(int size)
+{
+ void *out;
+ if (size > PAGE_SIZE)
+ out = vmalloc(size);
+ else
+ out = kmalloc(size, GFP_KERNEL);
+ return out;
+}
+
+/**
+ * ipc_free - free ipc space
+ * @ptr: pointer returned by ipc_alloc
+ * @size: size of block
+ *
+ * Free a block created with ipc_alloc(). The caller must know the size
+ * used in the allocation call.
+ */
+void ipc_free(void *ptr, int size)
+{
+ if (size > PAGE_SIZE)
+ vfree(ptr);
+ else
+ kfree(ptr);
+}
+
+/**
+ * ipc_rcu_alloc - allocate ipc and rcu space
+ * @size: size desired
+ *
+ * Allocate memory for the rcu header structure + the object.
+ * Returns the pointer to the object or NULL upon failure.
+ */
+void *ipc_rcu_alloc(int size)
+{
+ /*
+ * We prepend the allocation with the rcu struct
+ */
+ struct ipc_rcu *out = ipc_alloc(sizeof(struct ipc_rcu) + size);
+ if (unlikely(!out))
+ return NULL;
+ atomic_set(&out->refcount, 1);
+ return out + 1;
+}
+
+int ipc_rcu_getref(void *ptr)
+{
+ struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1;
+
+ return atomic_inc_not_zero(&p->refcount);
+}
+
+void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head))
+{
+ struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1;
+
+ if (!atomic_dec_and_test(&p->refcount))
+ return;
+
+ call_rcu(&p->rcu, func);
+}
+
+void ipc_rcu_free(struct rcu_head *head)
+{
+ struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
+
+ if (is_vmalloc_addr(p))
+ vfree(p);
+ else
+ kfree(p);
+}
+
+/**
+ * ipcperms - check ipc permissions
+ * @ns: ipc namespace
+ * @ipcp: ipc permission set
+ * @flag: desired permission set
+ *
+ * Check user, group and other permissions for access
+ * to ipc resources. Returns 0 if allowed.
+ *
+ * @flag will most probably be 0 or S_...UGO from <linux/stat.h>
+ */
+int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag)
+{
+ kuid_t euid = current_euid();
+ int requested_mode, granted_mode;
+
+ audit_ipc_obj(ipcp);
+ requested_mode = (flag >> 6) | (flag >> 3) | flag;
+ granted_mode = ipcp->mode;
+ if (uid_eq(euid, ipcp->cuid) ||
+ uid_eq(euid, ipcp->uid))
+ granted_mode >>= 6;
+ else if (in_group_p(ipcp->cgid) || in_group_p(ipcp->gid))
+ granted_mode >>= 3;
+ /* is there some bit set in requested_mode but not in granted_mode? */
+ if ((requested_mode & ~granted_mode & 0007) &&
+ !ns_capable(ns->user_ns, CAP_IPC_OWNER))
+ return -1;
+
+ return security_ipc_permission(ipcp, flag);
+}
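+
+/*
+ * Worked example (illustration only): with flag = S_IRUGO (0444) the folded
+ * requested_mode carries 4 (read) in its low three bits.  For a segment with
+ * mode 0600, an owner match shifts granted_mode down to 06, so
+ * 4 & ~06 & 0007 == 0 and access is granted; an unrelated caller keeps
+ * granted_mode 0600, whose low bits are 0, the check yields 4 and the caller
+ * needs CAP_IPC_OWNER before the LSM hook is even consulted.
+ */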
+
+/*
+ * Functions to convert between the kern_ipc_perm structure and the
+ * old/new ipc_perm structures
+ */
+
+/**
+ * kernel_to_ipc64_perm - convert kernel ipc permissions to user
+ * @in: kernel permissions
+ * @out: new style ipc permissions
+ *
+ * Turn the kernel object @in into a set of permissions descriptions
+ * for returning to userspace (@out).
+ */
+void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out)
+{
+ out->key = in->key;
+ out->uid = from_kuid_munged(current_user_ns(), in->uid);
+ out->gid = from_kgid_munged(current_user_ns(), in->gid);
+ out->cuid = from_kuid_munged(current_user_ns(), in->cuid);
+ out->cgid = from_kgid_munged(current_user_ns(), in->cgid);
+ out->mode = in->mode;
+ out->seq = in->seq;
+}
+
+/**
+ * ipc64_perm_to_ipc_perm - convert new ipc permissions to old
+ * @in: new style ipc permissions
+ * @out: old style ipc permissions
+ *
+ * Turn the new style permissions object @in into a compatibility
+ * object and store it into the @out pointer.
+ */
+void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out)
+{
+ out->key = in->key;
+ SET_UID(out->uid, in->uid);
+ SET_GID(out->gid, in->gid);
+ SET_UID(out->cuid, in->cuid);
+ SET_GID(out->cgid, in->cgid);
+ out->mode = in->mode;
+ out->seq = in->seq;
+}
+
+/**
+ * ipc_obtain_object - look up an ipc object by id
+ * @ids: ipc identifier set
+ * @id: ipc id to look for
+ *
+ * Look for an id in the ipc ids idr and return the associated ipc object.
+ *
+ * Call inside the RCU critical section.
+ * The ipc object is *not* locked on exit.
+ */
+struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id)
+{
+ struct kern_ipc_perm *out;
+ int lid = ipcid_to_idx(id);
+
+ out = idr_find(&ids->ipcs_idr, lid);
+ if (!out)
+ return ERR_PTR(-EINVAL);
+
+ return out;
+}
+
+/**
+ * ipc_lock - lock an ipc structure without rwsem held
+ * @ids: ipc identifier set
+ * @id: ipc id to look for
+ *
+ * Look for an id in the ipc ids idr and lock the associated ipc object.
+ *
+ * The ipc object is locked on successful exit.
+ */
+struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id)
+{
+ struct kern_ipc_perm *out;
+
+ rcu_read_lock();
+ out = ipc_obtain_object(ids, id);
+ if (IS_ERR(out))
+ goto err1;
+
+ spin_lock(&out->lock);
+
+ /* ipc_rmid() may have already freed the ID while ipc_lock
+ * was spinning: here verify that the structure is still valid
+ */
+ if (ipc_valid_object(out))
+ return out;
+
+ spin_unlock(&out->lock);
+ out = ERR_PTR(-EINVAL);
+err1:
+ rcu_read_unlock();
+ return out;
+}
+
+/**
+ * ipc_obtain_object_check - look up an ipc object and validate its id
+ * @ids: ipc identifier set
+ * @id: ipc id to look for
+ *
+ * Similar to ipc_obtain_object() but also checks that the id's
+ * sequence number still matches the object (see ipc_checkid()).
+ *
+ * Call inside the RCU critical section.
+ * The ipc object is *not* locked on exit.
+ */
+struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id)
+{
+ struct kern_ipc_perm *out = ipc_obtain_object(ids, id);
+
+ if (IS_ERR(out))
+ goto out;
+
+ if (ipc_checkid(out, id))
+ return ERR_PTR(-EIDRM);
+out:
+ return out;
+}
+
+/**
+ * ipcget - Common sys_*get() code
+ * @ns: namespace
+ * @ids: ipc identifier set
+ * @ops: operations to be called on ipc object creation, permission checks
+ * and further checks
+ * @params: the parameters needed by the previous operations.
+ *
+ * Common routine called by sys_msgget(), sys_semget() and sys_shmget().
+ */
+int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
+ const struct ipc_ops *ops, struct ipc_params *params)
+{
+ if (params->key == IPC_PRIVATE)
+ return ipcget_new(ns, ids, ops, params);
+ else
+ return ipcget_public(ns, ids, ops, params);
+}
+
+/**
+ * ipc_update_perm - update the permissions of an ipc object
+ * @in: the permission given as input.
+ * @out: the permission of the ipc to set.
+ */
+int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
+{
+ kuid_t uid = make_kuid(current_user_ns(), in->uid);
+ kgid_t gid = make_kgid(current_user_ns(), in->gid);
+ if (!uid_valid(uid) || !gid_valid(gid))
+ return -EINVAL;
+
+ out->uid = uid;
+ out->gid = gid;
+ out->mode = (out->mode & ~S_IRWXUGO)
+ | (in->mode & S_IRWXUGO);
+
+ return 0;
+}
+
+/**
+ * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd
+ * @ns: ipc namespace
+ * @ids: the table of ids where to look for the ipc
+ * @id: the id of the ipc to retrieve
+ * @cmd: the cmd to check
+ * @perm: the permission to set
+ * @extra_perm: one extra permission parameter used by msq
+ *
+ * This function does some common audit and permission checks for some IPC_XXX
+ * cmd and is called from semctl_down, shmctl_down and msgctl_down.
+ * It:
+ * - retrieves the ipc object with the given id in the given table.
+ * - performs some audit and permission checks, depending on the given cmd.
+ * - returns a pointer to the ipc object or, otherwise, the corresponding error.
+ *
+ * Must be called holding both the rwsem and the rcu read lock, but without
+ * the ipc object lock itself.
+ */
+struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
+ struct ipc_ids *ids, int id, int cmd,
+ struct ipc64_perm *perm, int extra_perm)
+{
+ kuid_t euid;
+ int err = -EPERM;
+ struct kern_ipc_perm *ipcp;
+
+ ipcp = ipc_obtain_object_check(ids, id);
+ if (IS_ERR(ipcp)) {
+ err = PTR_ERR(ipcp);
+ goto err;
+ }
+
+ audit_ipc_obj(ipcp);
+ if (cmd == IPC_SET)
+ audit_ipc_set_perm(extra_perm, perm->uid,
+ perm->gid, perm->mode);
+
+ euid = current_euid();
+ if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid) ||
+ ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return ipcp; /* successful lookup */
+err:
+ return ERR_PTR(err);
+}
+
+#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
+
+
+/**
+ * ipc_parse_version - ipc call version
+ * @cmd: pointer to command
+ *
+ * Return IPC_64 for new style IPC and IPC_OLD for old style IPC.
+ * The @cmd value is turned from an encoded command and version into
+ * just the command code.
+ */
+int ipc_parse_version(int *cmd)
+{
+ if (*cmd & IPC_64) {
+ *cmd ^= IPC_64;
+ return IPC_64;
+ } else {
+ return IPC_OLD;
+ }
+}
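+
+/*
+ * Example (illustration only): a modern libc issues e.g. semctl(id, 0,
+ * IPC_STAT) with cmd = IPC_STAT | IPC_64 (IPC_64 is 0x0100 in the uapi
+ * headers); ipc_parse_version() clears the flag so the command switch sees
+ * plain IPC_STAT, and returns IPC_64 so the caller copies out the
+ * ipc64_perm layout instead of the old one.
+ */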
+
+#endif /* CONFIG_ARCH_WANT_IPC_PARSE_VERSION */
+
+#ifdef CONFIG_PROC_FS
+struct ipc_proc_iter {
+ struct ipc_namespace *ns;
+ struct ipc_proc_iface *iface;
+};
+
+/*
+ * This routine finds and locks the first ipc structure at or after position pos.
+ */
+static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
+ loff_t *new_pos)
+{
+ struct kern_ipc_perm *ipc;
+ int total, id;
+
+ total = 0;
+ for (id = 0; id < pos && total < ids->in_use; id++) {
+ ipc = idr_find(&ids->ipcs_idr, id);
+ if (ipc != NULL)
+ total++;
+ }
+
+ if (total >= ids->in_use)
+ return NULL;
+
+ for (; pos < IPCMNI; pos++) {
+ ipc = idr_find(&ids->ipcs_idr, pos);
+ if (ipc != NULL) {
+ *new_pos = pos + 1;
+ rcu_read_lock();
+ ipc_lock_object(ipc);
+ return ipc;
+ }
+ }
+
+ /* Out of range - return NULL to terminate iteration */
+ return NULL;
+}
+
+static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos)
+{
+ struct ipc_proc_iter *iter = s->private;
+ struct ipc_proc_iface *iface = iter->iface;
+ struct kern_ipc_perm *ipc = it;
+
+ /* If we had an ipc id locked before, unlock it */
+ if (ipc && ipc != SEQ_START_TOKEN)
+ ipc_unlock(ipc);
+
+ return sysvipc_find_ipc(&iter->ns->ids[iface->ids], *pos, pos);
+}
+
+/*
+ * File positions: pos 0 -> header, pos n -> ipc id = n - 1.
+ * SeqFile iterator: the iterator value is a locked ipc pointer or SEQ_START_TOKEN.
+ */
+static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos)
+{
+ struct ipc_proc_iter *iter = s->private;
+ struct ipc_proc_iface *iface = iter->iface;
+ struct ipc_ids *ids;
+
+ ids = &iter->ns->ids[iface->ids];
+
+ /*
+ * Take the lock - this will be released by the corresponding
+ * call to stop().
+ */
+ down_read(&ids->rwsem);
+
+ /* pos < 0 is invalid */
+ if (*pos < 0)
+ return NULL;
+
+ /* pos == 0 means header */
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ /* Find the (pos-1)th ipc */
+ return sysvipc_find_ipc(ids, *pos - 1, pos);
+}
+
+static void sysvipc_proc_stop(struct seq_file *s, void *it)
+{
+ struct kern_ipc_perm *ipc = it;
+ struct ipc_proc_iter *iter = s->private;
+ struct ipc_proc_iface *iface = iter->iface;
+ struct ipc_ids *ids;
+
+ /* If we had a locked structure, release it */
+ if (ipc && ipc != SEQ_START_TOKEN)
+ ipc_unlock(ipc);
+
+ ids = &iter->ns->ids[iface->ids];
+ /* Release the lock we took in start() */
+ up_read(&ids->rwsem);
+}
+
+static int sysvipc_proc_show(struct seq_file *s, void *it)
+{
+ struct ipc_proc_iter *iter = s->private;
+ struct ipc_proc_iface *iface = iter->iface;
+
+ if (it == SEQ_START_TOKEN) {
+ seq_puts(s, iface->header);
+ return 0;
+ }
+
+ return iface->show(s, it);
+}
+
+static const struct seq_operations sysvipc_proc_seqops = {
+ .start = sysvipc_proc_start,
+ .stop = sysvipc_proc_stop,
+ .next = sysvipc_proc_next,
+ .show = sysvipc_proc_show,
+};
+
+static int sysvipc_proc_open(struct inode *inode, struct file *file)
+{
+ struct ipc_proc_iter *iter;
+
+ iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter));
+ if (!iter)
+ return -ENOMEM;
+
+ iter->iface = PDE_DATA(inode);
+ iter->ns = get_ipc_ns(current->nsproxy->ipc_ns);
+
+ return 0;
+}
+
+static int sysvipc_proc_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct ipc_proc_iter *iter = seq->private;
+ put_ipc_ns(iter->ns);
+ return seq_release_private(inode, file);
+}
+
+static const struct file_operations sysvipc_proc_fops = {
+ .open = sysvipc_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = sysvipc_proc_release,
+};
+#endif /* CONFIG_PROC_FS */
diff --git a/ipc/util.h b/ipc/util.h
new file mode 100644
index 000000000..1a5a0fcd0
--- /dev/null
+++ b/ipc/util.h
@@ -0,0 +1,207 @@
+/*
+ * linux/ipc/util.h
+ * Copyright (C) 1999 Christoph Rohland
+ *
+ * ipc helper functions (c) 1999 Manfred Spraul <manfred@colorfullife.com>
+ * namespaces support. 2006 OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ */
+
+#ifndef _IPC_UTIL_H
+#define _IPC_UTIL_H
+
+#include <linux/unistd.h>
+#include <linux/err.h>
+
+#define SEQ_MULTIPLIER (IPCMNI)
+
+void sem_init(void);
+void msg_init(void);
+void shm_init(void);
+
+struct ipc_namespace;
+
+#ifdef CONFIG_POSIX_MQUEUE
+extern void mq_clear_sbinfo(struct ipc_namespace *ns);
+extern void mq_put_mnt(struct ipc_namespace *ns);
+#else
+static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { }
+static inline void mq_put_mnt(struct ipc_namespace *ns) { }
+#endif
+
+#ifdef CONFIG_SYSVIPC
+void sem_init_ns(struct ipc_namespace *ns);
+void msg_init_ns(struct ipc_namespace *ns);
+void shm_init_ns(struct ipc_namespace *ns);
+
+void sem_exit_ns(struct ipc_namespace *ns);
+void msg_exit_ns(struct ipc_namespace *ns);
+void shm_exit_ns(struct ipc_namespace *ns);
+#else
+static inline void sem_init_ns(struct ipc_namespace *ns) { }
+static inline void msg_init_ns(struct ipc_namespace *ns) { }
+static inline void shm_init_ns(struct ipc_namespace *ns) { }
+
+static inline void sem_exit_ns(struct ipc_namespace *ns) { }
+static inline void msg_exit_ns(struct ipc_namespace *ns) { }
+static inline void shm_exit_ns(struct ipc_namespace *ns) { }
+#endif
+
+struct ipc_rcu {
+ struct rcu_head rcu;
+ atomic_t refcount;
+} ____cacheline_aligned_in_smp;
+
+#define ipc_rcu_to_struct(p) ((void *)(p+1))
+
+/*
+ * Structure that holds the parameters needed by the ipc operations
+ * (see below)
+ */
+struct ipc_params {
+ key_t key;
+ int flg;
+ union {
+ size_t size; /* for shared memories */
+ int nsems; /* for semaphores */
+ } u; /* holds the getnew() specific param */
+};
+
+/*
+ * Structure that holds some ipc operations. This structure is used to unify
+ * the calls to sys_msgget(), sys_semget(), sys_shmget()
+ * . routine to call to create a new ipc object. Can be one of newque,
+ * newary, newseg
+ * . routine to call to check permissions for a new ipc object.
+ * Can be one of security_msg_associate, security_sem_associate,
+ * security_shm_associate
+ * . routine to call for an extra check if needed
+ */
+struct ipc_ops {
+ int (*getnew)(struct ipc_namespace *, struct ipc_params *);
+ int (*associate)(struct kern_ipc_perm *, int);
+ int (*more_checks)(struct kern_ipc_perm *, struct ipc_params *);
+};
+
+struct seq_file;
+struct ipc_ids;
+
+void ipc_init_ids(struct ipc_ids *);
+#ifdef CONFIG_PROC_FS
+void __init ipc_init_proc_interface(const char *path, const char *header,
+ int ids, int (*show)(struct seq_file *, void *));
+#else
+#define ipc_init_proc_interface(path, header, ids, show) do {} while (0)
+#endif
+
+#define IPC_SEM_IDS 0
+#define IPC_MSG_IDS 1
+#define IPC_SHM_IDS 2
+
+#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
+#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)
+#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX)
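+
+/*
+ * Worked example (illustration only): an ipc id encodes an idr slot index
+ * plus a sequence counter, id = seq * SEQ_MULTIPLIER + idx.  With IPCMNI at
+ * its usual value of 32768, an object in slot 5 carrying sequence number 2
+ * gets id 2 * 32768 + 5 == 65541; ipcid_to_idx(65541) == 5,
+ * ipcid_to_seqx(65541) == 2, and a stale id built with an older sequence
+ * number for the same slot fails ipc_checkid().
+ */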
+
+/* must be called with ids->rwsem acquired for writing */
+int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
+
+/* must be called with ids->rwsem acquired for reading */
+int ipc_get_maxid(struct ipc_ids *);
+
+/* must be called with both locks acquired. */
+void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *);
+
+/* must be called with ipcp locked */
+int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
+
+/* for rare, potentially huge allocations.
+ * both functions can sleep
+ */
+void *ipc_alloc(int size);
+void ipc_free(void *ptr, int size);
+
+/*
+ * For allocations that need to be freed by RCU.
+ * Objects are reference counted; they start with a reference count of 1.
+ * getref increases the refcount; the putref call that reduces the refcount
+ * to 0 schedules the rcu destruction. Caller must guarantee locking.
+ */
+void *ipc_rcu_alloc(int size);
+int ipc_rcu_getref(void *ptr);
+void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head));
+void ipc_rcu_free(struct rcu_head *head);
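+
+/*
+ * Illustrative sketch of a typical ipc_rcu-backed lifetime (struct my_obj
+ * and my_obj_rcu_free are stand-ins for the per-type object and callback,
+ * e.g. sem_array / sem_rcu_free):
+ *
+ *	struct my_obj *p = ipc_rcu_alloc(sizeof(*p));	// refcount == 1
+ *	...
+ *	ipc_rcu_getref(p);			// pin across a lock drop
+ *	...					// sleeping allocation, relock
+ *	ipc_rcu_putref(p, ipc_rcu_free);	// drop the temporary pin
+ *	...
+ *	ipc_rcu_putref(p, my_obj_rcu_free);	// final put -> RCU-deferred free
+ */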
+
+struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
+struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id);
+
+void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
+void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
+int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out);
+struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
+ struct ipc_ids *ids, int id, int cmd,
+ struct ipc64_perm *perm, int extra_perm);
+
+#ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
+/* On IA-64, we always use the "64-bit version" of the IPC structures. */
+# define ipc_parse_version(cmd) IPC_64
+#else
+int ipc_parse_version(int *cmd);
+#endif
+
+extern void free_msg(struct msg_msg *msg);
+extern struct msg_msg *load_msg(const void __user *src, size_t len);
+extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst);
+extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len);
+
+extern void recompute_msgmni(struct ipc_namespace *);
+
+static inline int ipc_buildid(int id, int seq)
+{
+ return SEQ_MULTIPLIER * seq + id;
+}
+
+static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid)
+{
+ return uid / SEQ_MULTIPLIER != ipcp->seq;
+}
+
+static inline void ipc_lock_object(struct kern_ipc_perm *perm)
+{
+ spin_lock(&perm->lock);
+}
+
+static inline void ipc_unlock_object(struct kern_ipc_perm *perm)
+{
+ spin_unlock(&perm->lock);
+}
+
+static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm)
+{
+ assert_spin_locked(&perm->lock);
+}
+
+static inline void ipc_unlock(struct kern_ipc_perm *perm)
+{
+ ipc_unlock_object(perm);
+ rcu_read_unlock();
+}
+
+/*
+ * ipc_valid_object() - helper to sort out IPC_RMID races for codepaths
+ * where the respective ipc_ids.rwsem is not being held down.
+ * Checks whether the ipc object is still around or if it's gone already, as
+ * ipc_rmid() may have already freed the ID while the ipc lock was spinning.
+ * Needs to be called with kern_ipc_perm.lock held -- exception made for one
+ * checkpoint case at sys_semtimedop() as noted in code commentary.
+ */
+static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
+{
+ return !perm->deleted;
+}
+
+struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id);
+int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
+ const struct ipc_ops *ops, struct ipc_params *params);
+void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
+ void (*free)(struct ipc_namespace *, struct kern_ipc_perm *));
+#endif