Initial import

author: André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-08-05 17:04:01 -0300
committer: André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-08-05 17:04:01 -0300
commit: 57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree: 5e910f0e82173f4ef4f51111366a3f1299037a7b /ipc
53 files changed, 22114 insertions, 0 deletions
diff --git a/ipc/Makefile b/ipc/Makefile
new file mode 100644
index 000000000..68ec4167d
--- /dev/null
+++ b/ipc/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the linux ipc.
+#
+
+obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
+obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o syscall.o
+obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o
+obj_mq-$(CONFIG_COMPAT) += compat_mq.o
+obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
+obj-$(CONFIG_IPC_NS) += namespace.o
+obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o
+obj-$(CONFIG_KDBUS) += kdbus/
diff --git a/ipc/compat.c b/ipc/compat.c
new file mode 100644
index 000000000..9b3c85f8a
--- /dev/null
+++ b/ipc/compat.c
@@ -0,0 +1,757 @@
+/*
+ * 32 bit compatibility code for System V IPC
+ *
+ * Copyright (C) 1997,1998	Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997		David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1999		Arun Sharma <arun.sharma@intel.com>
+ * Copyright (C) 2000		VA Linux Co
+ * Copyright (C) 2000		Don Dugger <n0ano@valinux.com>
+ * Copyright (C) 2000           Hewlett-Packard Co.
+ * Copyright (C) 2000           David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2000           Gerhard Tonn (ton@de.ibm.com)
+ * Copyright (C) 2000-2002      Andi Kleen, SuSE Labs (x86-64 port)
+ * Copyright (C) 2000		Silicon Graphics, Inc.
+ * Copyright (C) 2001		IBM
+ * Copyright (C) 2004		IBM Deutschland Entwicklung GmbH, IBM Corporation
+ * Copyright (C) 2004		Arnd Bergmann (arnd@arndb.de)
+ *
+ * This code is collected from the versions for sparc64, mips64, s390x, ia64,
+ * ppc64 and x86_64, all of which are based on the original sparc64 version
+ * by Jakub Jelinek.
+ *
+ */
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/highuid.h>
+#include <linux/init.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/syscalls.h>
+#include <linux/ptrace.h>
+
+#include <linux/mutex.h>
+#include <linux/uaccess.h>
+
+#include "util.h"
+
+struct compat_msgbuf {
+	compat_long_t mtype;
+	char mtext[1];
+};
+
+struct compat_ipc_perm {
+	key_t key;
+	__compat_uid_t uid;
+	__compat_gid_t gid;
+	__compat_uid_t cuid;
+	__compat_gid_t cgid;
+	compat_mode_t mode;
+	unsigned short seq;
+};
+
+struct compat_semid_ds {
+	struct compat_ipc_perm sem_perm;
+	compat_time_t sem_otime;
+	compat_time_t sem_ctime;
+	compat_uptr_t sem_base;
+	compat_uptr_t sem_pending;
+	compat_uptr_t sem_pending_last;
+	compat_uptr_t undo;
+	unsigned short sem_nsems;
+};
+
+struct compat_msqid_ds {
+	struct compat_ipc_perm msg_perm;
+	compat_uptr_t msg_first;
+	compat_uptr_t msg_last;
+	compat_time_t msg_stime;
+	compat_time_t msg_rtime;
+	compat_time_t msg_ctime;
+	compat_ulong_t msg_lcbytes;
+	compat_ulong_t msg_lqbytes;
+	unsigned short msg_cbytes;
+	unsigned short msg_qnum;
+	unsigned short msg_qbytes;
+	compat_ipc_pid_t msg_lspid;
+	compat_ipc_pid_t msg_lrpid;
+};
+
+struct compat_shmid_ds {
+	struct compat_ipc_perm shm_perm;
+	int shm_segsz;
+	compat_time_t shm_atime;
+	compat_time_t shm_dtime;
+	compat_time_t shm_ctime;
+	compat_ipc_pid_t shm_cpid;
+	compat_ipc_pid_t shm_lpid;
+	unsigned short shm_nattch;
+	unsigned short shm_unused;
+	compat_uptr_t shm_unused2;
+	compat_uptr_t shm_unused3;
+};
+
+struct compat_ipc_kludge {
+	compat_uptr_t msgp;
+	compat_long_t msgtyp;
+};
+
+struct compat_shminfo64 {
+	compat_ulong_t shmmax;
+	compat_ulong_t shmmin;
+	compat_ulong_t shmmni;
+	compat_ulong_t shmseg;
+	compat_ulong_t shmall;
+	compat_ulong_t __unused1;
+	compat_ulong_t __unused2;
+	compat_ulong_t __unused3;
+	compat_ulong_t __unused4;
+};
+
+struct compat_shm_info {
+	compat_int_t used_ids;
+	compat_ulong_t shm_tot, shm_rss, shm_swp;
+	compat_ulong_t swap_attempts, swap_successes;
+};
+
+static inline int compat_ipc_parse_version(int *cmd)
+{
+#ifdef	CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION
+	int version = *cmd & IPC_64;	/* IPC_64 bit as passed in by the caller */
+
+	/* This is tricky: architectures that support the old ipc
+	 * structures in 64-bit binaries need IPC_64 set in cmd;
+	 * all other architectures need it cleared. */
+#ifndef ipc_parse_version
+	*cmd |= IPC_64;
+#else /* ipc_parse_version */
+	*cmd &= ~IPC_64;
+#endif /* ipc_parse_version */
+	return version;
+#else /* !CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION */
+	/* With the asm-generic APIs, we always use the 64-bit versions. */
+	return IPC_64;
+#endif /* CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION */
+}
+
+static inline int __get_compat_ipc64_perm(struct ipc64_perm *p64,
+					  struct compat_ipc64_perm __user *up64)
+{
+	int err;
+
+	err  = __get_user(p64->uid, &up64->uid);
+	err |= __get_user(p64->gid, &up64->gid);
+	err |= __get_user(p64->mode, &up64->mode);
+	return err;
+}
+
+static inline int __get_compat_ipc_perm(struct ipc64_perm *p,
+					struct compat_ipc_perm __user *up)
+{
+	int err;
+
+	err  = __get_user(p->uid, &up->uid);
+	err |= __get_user(p->gid, &up->gid);
+	err |= __get_user(p->mode, &up->mode);
+	return err;
+}
+
+static inline int __put_compat_ipc64_perm(struct ipc64_perm *p64,
+					  struct compat_ipc64_perm __user *up64)
+{
+	int err;
+
+	err  = __put_user(p64->key, &up64->key);
+	err |= __put_user(p64->uid, &up64->uid);
+	err |= __put_user(p64->gid, &up64->gid);
+	err |= __put_user(p64->cuid, &up64->cuid);
+	err |= __put_user(p64->cgid, &up64->cgid);
+	err |= __put_user(p64->mode, &up64->mode);
+	err |= __put_user(p64->seq, &up64->seq);
+	return err;
+}
+
+static inline int __put_compat_ipc_perm(struct ipc64_perm *p,
+					struct compat_ipc_perm __user *uip)
+{
+	int err;
+	__compat_uid_t u;
+	__compat_gid_t g;
+
+	err  = __put_user(p->key, &uip->key);
+	SET_UID(u, p->uid);
+	err |= __put_user(u, &uip->uid);
+	SET_GID(g, p->gid);
+	err |= __put_user(g, &uip->gid);
+	SET_UID(u, p->cuid);
+	err |= __put_user(u, &uip->cuid);
+	SET_GID(g, p->cgid);
+	err |= __put_user(g, &uip->cgid);
+	err |= __put_user(p->mode, &uip->mode);
+	err |= __put_user(p->seq, &uip->seq);
+	return err;
+}
+
+static inline int get_compat_semid64_ds(struct semid64_ds *sem64,
+					struct compat_semid64_ds __user *up64)
+{
+	if (!access_ok(VERIFY_READ, up64, sizeof(*up64)))
+		return -EFAULT;
+	return __get_compat_ipc64_perm(&sem64->sem_perm, &up64->sem_perm);
+}
+
+static inline int get_compat_semid_ds(struct semid64_ds *s,
+				      struct compat_semid_ds __user *up)
+{
+	if (!access_ok(VERIFY_READ, up, sizeof(*up)))
+		return -EFAULT;
+	return __get_compat_ipc_perm(&s->sem_perm, &up->sem_perm);
+}
+
+static inline int put_compat_semid64_ds(struct semid64_ds *sem64,
+					struct compat_semid64_ds __user *up64)
+{
+	int err;
+
+	if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
+		return -EFAULT;
+	err  = __put_compat_ipc64_perm(&sem64->sem_perm, &up64->sem_perm);
+	err |= __put_user(sem64->sem_otime, &up64->sem_otime);
+	err |= __put_user(sem64->sem_ctime, &up64->sem_ctime);
+	err |= __put_user(sem64->sem_nsems, &up64->sem_nsems);
+	return err;
+}
+
+static inline int put_compat_semid_ds(struct semid64_ds *s,
+				      struct compat_semid_ds __user *up)
+{
+	int err;
+
+	if (!access_ok(VERIFY_WRITE, up, sizeof(*up)))
+		return -EFAULT;
+	err  = __put_compat_ipc_perm(&s->sem_perm, &up->sem_perm);
+	err |= __put_user(s->sem_otime, &up->sem_otime);
+	err |= __put_user(s->sem_ctime, &up->sem_ctime);
+	err |= __put_user(s->sem_nsems, &up->sem_nsems);
+	return err;
+}
+
+static long do_compat_semctl(int first, int second, int third, u32 pad)
+{
+	unsigned long fourth;
+	int err, err2;
+	struct semid64_ds sem64;
+	struct semid64_ds __user *up64;
+	int version = compat_ipc_parse_version(&third);
+
+	memset(&sem64, 0, sizeof(sem64));
+
+	if ((third & (~IPC_64)) == SETVAL)
+#ifdef __BIG_ENDIAN
+		fourth = (unsigned long)pad << 32;
+#else
+		fourth = pad;
+#endif
+	else
+		fourth = (unsigned long)compat_ptr(pad);
+	switch (third & (~IPC_64)) {
+	case IPC_INFO:
+	case IPC_RMID:
+	case SEM_INFO:
+	case GETVAL:
+	case GETPID:
+	case GETNCNT:
+	case GETZCNT:
+	case GETALL:
+	case SETVAL:
+	case SETALL:
+		err = sys_semctl(first, second, third, fourth);
+		break;
+
+	case IPC_STAT:
+	case SEM_STAT:
+		up64 = compat_alloc_user_space(sizeof(sem64));
+		fourth = (unsigned long)up64;
+		err = sys_semctl(first, second, third, fourth);
+		if (err < 0)
+			break;
+		if (copy_from_user(&sem64, up64, sizeof(sem64)))
+			err2 = -EFAULT;
+		else if (version == IPC_64)
+			err2 = put_compat_semid64_ds(&sem64, compat_ptr(pad));
+		else
+			err2 = put_compat_semid_ds(&sem64, compat_ptr(pad));
+		if (err2)
+			err = -EFAULT;
+		break;
+
+	case IPC_SET:
+		if (version == IPC_64)
+			err = get_compat_semid64_ds(&sem64, compat_ptr(pad));
+		else
+			err = get_compat_semid_ds(&sem64, compat_ptr(pad));
+
+		up64 = compat_alloc_user_space(sizeof(sem64));
+		if (copy_to_user(up64, &sem64, sizeof(sem64)))
+			err = -EFAULT;
+		if (err)
+			break;
+
+		fourth = (unsigned long)up64;
+		err = sys_semctl(first, second, third, fourth);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz)
+{
+	struct compat_msgbuf __user *msgp = dest;
+	size_t msgsz;
+
+	if (put_user(msg->m_type, &msgp->mtype))
+		return -EFAULT;
+
+	msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz;
+	if (store_msg(msgp->mtext, msg, msgsz))
+		return -EFAULT;
+	return msgsz;
+}
+
+#ifndef COMPAT_SHMLBA
+#define COMPAT_SHMLBA	SHMLBA
+#endif
+
+#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC
+COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second,
+	u32, third, compat_uptr_t, ptr, u32, fifth)
+{
+	int version;
+	u32 pad;
+
+	version = call >> 16; /* hack for backward compatibility */
+	call &= 0xffff;
+
+	switch (call) {
+	case SEMOP:
+		/* struct sembuf is the same on 32 and 64bit :)) */
+		return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
+	case SEMTIMEDOP:
+		return compat_sys_semtimedop(first, compat_ptr(ptr), second,
+						compat_ptr(fifth));
+	case SEMGET:
+		return sys_semget(first, second, third);
+	case SEMCTL:
+		if (!ptr)
+			return -EINVAL;
+		if (get_user(pad, (u32 __user *) compat_ptr(ptr)))
+			return -EFAULT;
+		return do_compat_semctl(first, second, third, pad);
+
+	case MSGSND: {
+		struct compat_msgbuf __user *up = compat_ptr(ptr);
+		compat_long_t type;
+
+		if (first < 0 || second < 0)
+			return -EINVAL;
+
+		if (get_user(type, &up->mtype))
+			return -EFAULT;
+
+		return do_msgsnd(first, type, up->mtext, second, third);
+	}
+	case MSGRCV: {
+		void __user *uptr = compat_ptr(ptr);
+
+		if (first < 0 || second < 0)
+			return -EINVAL;
+
+		if (!version) {
+			struct compat_ipc_kludge ipck;
+			if (!uptr)
+				return -EINVAL;
+			if (copy_from_user(&ipck, uptr, sizeof(ipck)))
+				return -EFAULT;
+			uptr = compat_ptr(ipck.msgp);
+			fifth = ipck.msgtyp;
+		}
+		return do_msgrcv(first, uptr, second, (s32)fifth, third,
+				 compat_do_msg_fill);
+	}
+	case MSGGET:
+		return sys_msgget(first, second);
+	case MSGCTL:
+		return compat_sys_msgctl(first, second, compat_ptr(ptr));
+
+	case SHMAT: {
+		int err;
+		unsigned long raddr;
+
+		if (version == 1)
+			return -EINVAL;
+		err = do_shmat(first, compat_ptr(ptr), second, &raddr,
+			       COMPAT_SHMLBA);
+		if (err < 0)
+			return err;
+		return put_user(raddr, (compat_ulong_t *)compat_ptr(third));
+	}
+	case SHMDT:
+		return sys_shmdt(compat_ptr(ptr));
+	case SHMGET:
+		return sys_shmget(first, (unsigned)second, third);
+	case SHMCTL:
+		return compat_sys_shmctl(first, second, compat_ptr(ptr));
+	}
+
+	return -ENOSYS;
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg)
+{
+	return do_compat_semctl(semid, semnum, cmd, arg);
+}
+
+COMPAT_SYSCALL_DEFINE4(msgsnd, int, msqid, compat_uptr_t, msgp,
+		       compat_ssize_t, msgsz, int, msgflg)
+{
+	struct compat_msgbuf __user *up = compat_ptr(msgp);
+	compat_long_t mtype;
+
+	if (get_user(mtype, &up->mtype))
+		return -EFAULT;
+	return do_msgsnd(msqid, mtype, up->mtext, (ssize_t)msgsz, msgflg);
+}
+
+COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp,
+		       compat_ssize_t, msgsz, compat_long_t, msgtyp, int, msgflg)
+{
+	return do_msgrcv(msqid, compat_ptr(msgp), (ssize_t)msgsz, (long)msgtyp,
+			 msgflg, compat_do_msg_fill);
+}
+
+static inline int get_compat_msqid64(struct msqid64_ds *m64,
+				     struct compat_msqid64_ds __user *up64)
+{
+	int err;
+
+	if (!access_ok(VERIFY_READ, up64, sizeof(*up64)))
+		return -EFAULT;
+	err  = __get_compat_ipc64_perm(&m64->msg_perm, &up64->msg_perm);
+	err |= __get_user(m64->msg_qbytes, &up64->msg_qbytes);
+	return err;
+}
+
+static inline int get_compat_msqid(struct msqid64_ds *m,
+				   struct compat_msqid_ds __user *up)
+{
+	int err;
+
+	if (!access_ok(VERIFY_READ, up, sizeof(*up)))
+		return -EFAULT;
+	err  = __get_compat_ipc_perm(&m->msg_perm, &up->msg_perm);
+	err |= __get_user(m->msg_qbytes, &up->msg_qbytes);
+	return err;
+}
+
+static inline int put_compat_msqid64_ds(struct msqid64_ds *m64,
+				 struct compat_msqid64_ds __user *up64)
+{
+	int err;
+
+	if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
+		return -EFAULT;
+	err  = __put_compat_ipc64_perm(&m64->msg_perm, &up64->msg_perm);
+	err |= __put_user(m64->msg_stime, &up64->msg_stime);
+	err |= __put_user(m64->msg_rtime, &up64->msg_rtime);
+	err |= __put_user(m64->msg_ctime, &up64->msg_ctime);
+	err |= __put_user(m64->msg_cbytes, &up64->msg_cbytes);
+	err |= __put_user(m64->msg_qnum, &up64->msg_qnum);
+	err |= __put_user(m64->msg_qbytes, &up64->msg_qbytes);
+	err |= __put_user(m64->msg_lspid, &up64->msg_lspid);
+	err |= __put_user(m64->msg_lrpid, &up64->msg_lrpid);
+	return err;
+}
+
+static inline int put_compat_msqid_ds(struct msqid64_ds *m,
+				      struct compat_msqid_ds __user *up)
+{
+	int err;
+
+	if (!access_ok(VERIFY_WRITE, up, sizeof(*up)))
+		return -EFAULT;
+	err  = __put_compat_ipc_perm(&m->msg_perm, &up->msg_perm);
+	err |= __put_user(m->msg_stime, &up->msg_stime);
+	err |= __put_user(m->msg_rtime, &up->msg_rtime);
+	err |= __put_user(m->msg_ctime, &up->msg_ctime);
+	err |= __put_user(m->msg_cbytes, &up->msg_cbytes);
+	err |= __put_user(m->msg_qnum, &up->msg_qnum);
+	err |= __put_user(m->msg_qbytes, &up->msg_qbytes);
+	err |= __put_user(m->msg_lspid, &up->msg_lspid);
+	err |= __put_user(m->msg_lrpid, &up->msg_lrpid);
+	return err;
+}
+
+COMPAT_SYSCALL_DEFINE3(msgctl, int, first, int, second, void __user *, uptr)
+{
+	int err, err2;
+	struct msqid64_ds m64;
+	int version = compat_ipc_parse_version(&second);
+	void __user *p;
+
+	memset(&m64, 0, sizeof(m64));
+
+	switch (second & (~IPC_64)) {
+	case IPC_INFO:
+	case IPC_RMID:
+	case MSG_INFO:
+		err = sys_msgctl(first, second, uptr);
+		break;
+
+	case IPC_SET:
+		if (version == IPC_64)
+			err = get_compat_msqid64(&m64, uptr);
+		else
+			err = get_compat_msqid(&m64, uptr);
+
+		if (err)
+			break;
+		p = compat_alloc_user_space(sizeof(m64));
+		if (copy_to_user(p, &m64, sizeof(m64)))
+			err = -EFAULT;
+		else
+			err = sys_msgctl(first, second, p);
+		break;
+
+	case IPC_STAT:
+	case MSG_STAT:
+		p = compat_alloc_user_space(sizeof(m64));
+		err = sys_msgctl(first, second, p);
+		if (err < 0)
+			break;
+		if (copy_from_user(&m64, p, sizeof(m64)))
+			err2 = -EFAULT;
+		else if (version == IPC_64)
+			err2 = put_compat_msqid64_ds(&m64, uptr);
+		else
+			err2 = put_compat_msqid_ds(&m64, uptr);
+		if (err2)
+			err = -EFAULT;
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+COMPAT_SYSCALL_DEFINE3(shmat, int, shmid, compat_uptr_t, shmaddr, int, shmflg)
+{
+	unsigned long ret;
+	long err;
+
+	err = do_shmat(shmid, compat_ptr(shmaddr), shmflg, &ret, COMPAT_SHMLBA);
+	if (err)
+		return err;
+	force_successful_syscall_return();
+	return (long)ret;
+}
+
+static inline int get_compat_shmid64_ds(struct shmid64_ds *sem64,
+					struct compat_shmid64_ds __user *up64)
+{
+	if (!access_ok(VERIFY_READ, up64, sizeof(*up64)))
+		return -EFAULT;
+	return __get_compat_ipc64_perm(&sem64->shm_perm, &up64->shm_perm);
+}
+
+static inline int get_compat_shmid_ds(struct shmid64_ds *s,
+				      struct compat_shmid_ds __user *up)
+{
+	if (!access_ok(VERIFY_READ, up, sizeof(*up)))
+		return -EFAULT;
+	return __get_compat_ipc_perm(&s->shm_perm, &up->shm_perm);
+}
+
+static inline int put_compat_shmid64_ds(struct shmid64_ds *sem64,
+					struct compat_shmid64_ds __user *up64)
+{
+	int err;
+
+	if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
+		return -EFAULT;
+	err  = __put_compat_ipc64_perm(&sem64->shm_perm, &up64->shm_perm);
+	err |= __put_user(sem64->shm_atime, &up64->shm_atime);
+	err |= __put_user(sem64->shm_dtime, &up64->shm_dtime);
+	err |= __put_user(sem64->shm_ctime, &up64->shm_ctime);
+	err |= __put_user(sem64->shm_segsz, &up64->shm_segsz);
+	err |= __put_user(sem64->shm_nattch, &up64->shm_nattch);
+	err |= __put_user(sem64->shm_cpid, &up64->shm_cpid);
+	err |= __put_user(sem64->shm_lpid, &up64->shm_lpid);
+	return err;
+}
+
+static inline int put_compat_shmid_ds(struct shmid64_ds *s,
+				      struct compat_shmid_ds __user *up)
+{
+	int err;
+
+	if (!access_ok(VERIFY_WRITE, up, sizeof(*up)))
+		return -EFAULT;
+	err  = __put_compat_ipc_perm(&s->shm_perm, &up->shm_perm);
+	err |= __put_user(s->shm_atime, &up->shm_atime);
+	err |= __put_user(s->shm_dtime, &up->shm_dtime);
+	err |= __put_user(s->shm_ctime, &up->shm_ctime);
+	err |= __put_user(s->shm_segsz, &up->shm_segsz);
+	err |= __put_user(s->shm_nattch, &up->shm_nattch);
+	err |= __put_user(s->shm_cpid, &up->shm_cpid);
+	err |= __put_user(s->shm_lpid, &up->shm_lpid);
+	return err;
+}
+
+static inline int put_compat_shminfo64(struct shminfo64 *smi,
+				       struct compat_shminfo64 __user *up64)
+{
+	int err;
+
+	if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
+		return -EFAULT;
+	if (smi->shmmax > INT_MAX)
+		smi->shmmax = INT_MAX;
+	err  = __put_user(smi->shmmax, &up64->shmmax);
+	err |= __put_user(smi->shmmin, &up64->shmmin);
+	err |= __put_user(smi->shmmni, &up64->shmmni);
+	err |= __put_user(smi->shmseg, &up64->shmseg);
+	err |= __put_user(smi->shmall, &up64->shmall);
+	return err;
+}
+
+static inline int put_compat_shminfo(struct shminfo64 *smi,
+				     struct shminfo __user *up)
+{
+	int err;
+
+	if (!access_ok(VERIFY_WRITE, up, sizeof(*up)))
+		return -EFAULT;
+	if (smi->shmmax > INT_MAX)
+		smi->shmmax = INT_MAX;
+	err  = __put_user(smi->shmmax, &up->shmmax);
+	err |= __put_user(smi->shmmin, &up->shmmin);
+	err |= __put_user(smi->shmmni, &up->shmmni);
+	err |= __put_user(smi->shmseg, &up->shmseg);
+	err |= __put_user(smi->shmall, &up->shmall);
+	return err;
+}
+
+static inline int put_compat_shm_info(struct shm_info __user *ip,
+				      struct compat_shm_info __user *uip)
+{
+	int err;
+	struct shm_info si;
+
+	if (!access_ok(VERIFY_WRITE, uip, sizeof(*uip)) ||
+	    copy_from_user(&si, ip, sizeof(si)))
+		return -EFAULT;
+	err  = __put_user(si.used_ids, &uip->used_ids);
+	err |= __put_user(si.shm_tot, &uip->shm_tot);
+	err |= __put_user(si.shm_rss, &uip->shm_rss);
+	err |= __put_user(si.shm_swp, &uip->shm_swp);
+	err |= __put_user(si.swap_attempts, &uip->swap_attempts);
+	err |= __put_user(si.swap_successes, &uip->swap_successes);
+	return err;
+}
+
+COMPAT_SYSCALL_DEFINE3(shmctl, int, first, int, second, void __user *, uptr)
+{
+	void __user *p;
+	struct shmid64_ds sem64;
+	struct shminfo64 smi;
+	int err, err2;
+	int version = compat_ipc_parse_version(&second);
+
+	memset(&sem64, 0, sizeof(sem64));
+
+	switch (second & (~IPC_64)) {
+	case IPC_RMID:
+	case SHM_LOCK:
+	case SHM_UNLOCK:
+		err = sys_shmctl(first, second, uptr);
+		break;
+
+	case IPC_INFO:
+		p = compat_alloc_user_space(sizeof(smi));
+		err = sys_shmctl(first, second, p);
+		if (err < 0)
+			break;
+		if (copy_from_user(&smi, p, sizeof(smi)))
+			err2 = -EFAULT;
+		else if (version == IPC_64)
+			err2 = put_compat_shminfo64(&smi, uptr);
+		else
+			err2 = put_compat_shminfo(&smi, uptr);
+		if (err2)
+			err = -EFAULT;
+		break;
+
+
+	case IPC_SET:
+		if (version == IPC_64)
+			err = get_compat_shmid64_ds(&sem64, uptr);
+		else
+			err = get_compat_shmid_ds(&sem64, uptr);
+
+		if (err)
+			break;
+		p = compat_alloc_user_space(sizeof(sem64));
+		if (copy_to_user(p, &sem64, sizeof(sem64)))
+			err = -EFAULT;
+		else
+			err = sys_shmctl(first, second, p);
+		break;
+
+	case IPC_STAT:
+	case SHM_STAT:
+		p = compat_alloc_user_space(sizeof(sem64));
+		err = sys_shmctl(first, second, p);
+		if (err < 0)
+			break;
+		if (copy_from_user(&sem64, p, sizeof(sem64)))
+			err2 = -EFAULT;
+		else if (version == IPC_64)
+			err2 = put_compat_shmid64_ds(&sem64, uptr);
+		else
+			err2 = put_compat_shmid_ds(&sem64, uptr);
+		if (err2)
+			err = -EFAULT;
+		break;
+
+	case SHM_INFO:
+		p = compat_alloc_user_space(sizeof(struct shm_info));
+		err = sys_shmctl(first, second, p);
+		if (err < 0)
+			break;
+		err2 = put_compat_shm_info(p, uptr);
+		if (err2)
+			err = -EFAULT;
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+COMPAT_SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsems,
+		       unsigned, nsops,
+		       const struct compat_timespec __user *, timeout)
+{
+	struct timespec __user *ts64;
+	if (compat_convert_timespec(&ts64, timeout))
+		return -EFAULT;
+	return sys_semtimedop(semid, tsems, nsops, ts64);
+}
diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c
new file mode 100644
index 000000000..ef6f91cc4
--- /dev/null
+++ b/ipc/compat_mq.c
@@ -0,0 +1,138 @@
+/*
+ *  ipc/compat_mq.c
+ *    32 bit emulation for POSIX message queue system calls
+ *
+ *    Copyright (C) 2004 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *    Author: Arnd Bergmann <arnd@arndb.de>
+ */
+
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mqueue.h>
+#include <linux/syscalls.h>
+
+#include <linux/uaccess.h>
+
+struct compat_mq_attr {
+	compat_long_t mq_flags;      /* message queue flags		     */
+	compat_long_t mq_maxmsg;     /* maximum number of messages	     */
+	compat_long_t mq_msgsize;    /* maximum message size		     */
+	compat_long_t mq_curmsgs;    /* number of messages currently queued  */
+	compat_long_t __reserved[4]; /* ignored for input, zeroed for output */
+};
+
+static inline int get_compat_mq_attr(struct mq_attr *attr,
+			const struct compat_mq_attr __user *uattr)
+{
+	if (!access_ok(VERIFY_READ, uattr, sizeof *uattr))
+		return -EFAULT;
+
+	return __get_user(attr->mq_flags, &uattr->mq_flags)
+		| __get_user(attr->mq_maxmsg, &uattr->mq_maxmsg)
+		| __get_user(attr->mq_msgsize, &uattr->mq_msgsize)
+		| __get_user(attr->mq_curmsgs, &uattr->mq_curmsgs);
+}
+
+static inline int put_compat_mq_attr(const struct mq_attr *attr,
+			struct compat_mq_attr __user *uattr)
+{
+	if (clear_user(uattr, sizeof *uattr))
+		return -EFAULT;
+
+	return __put_user(attr->mq_flags, &uattr->mq_flags)
+		| __put_user(attr->mq_maxmsg, &uattr->mq_maxmsg)
+		| __put_user(attr->mq_msgsize, &uattr->mq_msgsize)
+		| __put_user(attr->mq_curmsgs, &uattr->mq_curmsgs);
+}
+
+COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name,
+		       int, oflag, compat_mode_t, mode,
+		       struct compat_mq_attr __user *, u_attr)
+{
+	void __user *p = NULL;
+	if (u_attr && oflag & O_CREAT) {
+		struct mq_attr attr;
+
+		memset(&attr, 0, sizeof(attr));
+
+		p = compat_alloc_user_space(sizeof(attr));
+		if (get_compat_mq_attr(&attr, u_attr) ||
+		    copy_to_user(p, &attr, sizeof(attr)))
+			return -EFAULT;
+	}
+	return sys_mq_open(u_name, oflag, mode, p);
+}
+
+COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes,
+		       const char __user *, u_msg_ptr,
+		       compat_size_t, msg_len, unsigned int, msg_prio,
+		       const struct compat_timespec __user *, u_abs_timeout)
+{
+	struct timespec __user *u_ts;
+
+	if (compat_convert_timespec(&u_ts, u_abs_timeout))
+		return -EFAULT;
+
+	return sys_mq_timedsend(mqdes, u_msg_ptr, msg_len,
+			msg_prio, u_ts);
+}
+
+COMPAT_SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes,
+		       char __user *, u_msg_ptr,
+		       compat_size_t, msg_len, unsigned int __user *, u_msg_prio,
+		       const struct compat_timespec __user *, u_abs_timeout)
+{
+	struct timespec __user *u_ts;
+
+	if (compat_convert_timespec(&u_ts, u_abs_timeout))
+		return -EFAULT;
+
+	return sys_mq_timedreceive(mqdes, u_msg_ptr, msg_len,
+			u_msg_prio, u_ts);
+}
+
+COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
+		       const struct compat_sigevent __user *, u_notification)
+{
+	struct sigevent __user *p = NULL;
+	if (u_notification) {
+		struct sigevent n;
+		p = compat_alloc_user_space(sizeof(*p));
+		if (get_compat_sigevent(&n, u_notification))
+			return -EFAULT;
+		if (n.sigev_notify == SIGEV_THREAD)
+			n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int);
+		if (copy_to_user(p, &n, sizeof(*p)))
+			return -EFAULT;
+	}
+	return sys_mq_notify(mqdes, p);
+}
+
+COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
+		       const struct compat_mq_attr __user *, u_mqstat,
+		       struct compat_mq_attr __user *, u_omqstat)
+{
+	struct mq_attr mqstat;
+	struct mq_attr __user *p = compat_alloc_user_space(2 * sizeof(*p));
+	long ret;
+
+	memset(&mqstat, 0, sizeof(mqstat));
+
+	if (u_mqstat) {
+		if (get_compat_mq_attr(&mqstat, u_mqstat) ||
+		    copy_to_user(p, &mqstat, sizeof(mqstat)))
+			return -EFAULT;
+	}
+	ret = sys_mq_getsetattr(mqdes,
+				u_mqstat ? p : NULL,
+				u_omqstat ? p + 1 : NULL);
+	if (ret)
+		return ret;
+	if (u_omqstat) {
+		if (copy_from_user(&mqstat, p + 1, sizeof(mqstat)) ||
+		    put_compat_mq_attr(&mqstat, u_omqstat))
+			return -EFAULT;
+	}
+	return 0;
+}
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
new file mode 100644
index 000000000..8ad93c29f
--- /dev/null
+++ b/ipc/ipc_sysctl.c
@@ -0,0 +1,224 @@
+/*
+ *  Copyright (C) 2007
+ *
+ *  Author: Eric Biederman <ebiederm@xmission.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/module.h>
+#include <linux/ipc.h>
+#include <linux/nsproxy.h>
+#include <linux/sysctl.h>
+#include <linux/uaccess.h>
+#include <linux/ipc_namespace.h>
+#include <linux/msg.h>
+#include "util.h"
+
+static void *get_ipc(struct ctl_table *table)
+{
+	/* table->data points into init_ipc_ns; rebase it onto our namespace */
+	long offset = (char *)table->data - (char *)&init_ipc_ns;
+
+	return (char *)current->nsproxy->ipc_ns + offset;
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+static int proc_ipc_dointvec(struct ctl_table *table, int write,
+	void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	/* Private copy, so the shared table entry is never modified. */
+	struct ctl_table ns_table = *table;
+
+	/* Point the copy at the current namespace's instance. */
+	ns_table.data = get_ipc(table);
+	return proc_dointvec(&ns_table, write, buffer, lenp, ppos);
+}
+
+static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write,
+	void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	/* Private copy, so the shared table entry is never modified. */
+	struct ctl_table ns_table = *table;
+
+	/* Point the copy at the current namespace's instance. */
+	ns_table.data = get_ipc(table);
+	return proc_dointvec_minmax(&ns_table, write, buffer, lenp, ppos);
+}
+
+static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
+	void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	int ret;
+
+	ret = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	/* On success with shm_rmid_forced set, run the orphan cleanup. */
+	if (ret >= 0 && ipc_ns->shm_rmid_forced)
+		shm_destroy_orphaned(ipc_ns);
+	return ret;
+}
+
+static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write,
+	void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	/* Private copy of the entry, pointed at this namespace's value. */
+	struct ctl_table ns_table = *table;
+
+	ns_table.data = get_ipc(table);
+
+	return proc_doulongvec_minmax(&ns_table, write, buffer, lenp, ppos);
+}
+
+static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
+	void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table ipc_table;
+	int dummy = 0;
+
+	/* Backed by a throwaway local: reads yield 0, writes are dropped. */
+	memcpy(&ipc_table, table, sizeof(ipc_table));
+	ipc_table.data = &dummy;
+	if (write)
+		pr_info_once("writing to auto_msgmni has no effect\n");
+
+	return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
+}
+
+#else
+#define proc_ipc_doulongvec_minmax NULL
+#define proc_ipc_dointvec	   NULL
+#define proc_ipc_dointvec_minmax   NULL
+#define proc_ipc_dointvec_minmax_orphans   NULL
+#define proc_ipc_auto_msgmni	   NULL
+#endif
+
+static int zero;
+static int one = 1;
+static int int_max = INT_MAX;
+
+static struct ctl_table ipc_kern_table[] = {
+	{
+		.procname	= "shmmax",
+		.data		= &init_ipc_ns.shm_ctlmax,
+		.maxlen		= sizeof(init_ipc_ns.shm_ctlmax),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_doulongvec_minmax,
+	},
+	{
+		.procname	= "shmall",
+		.data		= &init_ipc_ns.shm_ctlall,
+		.maxlen		= sizeof(init_ipc_ns.shm_ctlall),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_doulongvec_minmax,
+	},
+	{
+		.procname	= "shmmni",
+		.data		= &init_ipc_ns.shm_ctlmni,
+		.maxlen		= sizeof(init_ipc_ns.shm_ctlmni),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_dointvec,
+	},
+	{
+		.procname	= "shm_rmid_forced",
+		.data		= &init_ipc_ns.shm_rmid_forced,
+		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
+		.procname	= "msgmax",
+		.data		= &init_ipc_ns.msg_ctlmax,
+		.maxlen		= sizeof(init_ipc_ns.msg_ctlmax),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max,
+	},
+	{
+		.procname	= "msgmni",
+		.data		= &init_ipc_ns.msg_ctlmni,
+		.maxlen		= sizeof(init_ipc_ns.msg_ctlmni),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max,
+	},
+	{
+		.procname	= "auto_msgmni",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_auto_msgmni,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
+		.procname	=  "msgmnb",
+		.data		= &init_ipc_ns.msg_ctlmnb,
+		.maxlen		= sizeof(init_ipc_ns.msg_ctlmnb),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max,
+	},
+	{
+		.procname	= "sem",
+		.data		= &init_ipc_ns.sem_ctls,
+		.maxlen		= 4*sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_dointvec,
+	},
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	{
+		.procname	= "sem_next_id",
+		.data		= &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
+		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max,
+	},
+	{
+		.procname	= "msg_next_id",
+		.data		= &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
+		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max,
+	},
+	{
+		.procname	= "shm_next_id",
+		.data		= &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
+		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
+		.mode		= 0644,
+		.proc_handler	= proc_ipc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &int_max,
+	},
+#endif
+	{}
+};
+
+static struct ctl_table ipc_root_table[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= ipc_kern_table,
+	},
+	{}
+};
+
+static int __init ipc_sysctl_init(void)
+{
+	register_sysctl_table(ipc_root_table);
+	return 0;
+}
+
+device_initcall(ipc_sysctl_init);
diff --git a/ipc/kdbus/Makefile b/ipc/kdbus/Makefile
new file mode 100644
index 000000000..66663a124
--- /dev/null
+++ b/ipc/kdbus/Makefile
@@ -0,0 +1,33 @@
+#
+# By setting KDBUS_EXT=2, the kdbus module will be built as kdbus2.ko, and
+# KBUILD_MODNAME=kdbus2. This has the effect that all exported objects have
+# different names than usual (kdbus2fs, /sys/fs/kdbus2/) and you can run
+# your test-infrastructure against the kdbus2.ko, while running your system
+# on kdbus.ko.
+#
+# To just build the module, use:
+#     make KDBUS_EXT=2 M=ipc/kdbus
+#
+
+kdbus$(KDBUS_EXT)-y := \
+	bus.o \
+	connection.o \
+	endpoint.o \
+	fs.o \
+	handle.o \
+	item.o \
+	main.o \
+	match.o \
+	message.o \
+	metadata.o \
+	names.o \
+	node.o \
+	notify.o \
+	domain.o \
+	policy.o \
+	pool.o \
+	reply.o \
+	queue.o \
+	util.o
+
+obj-$(CONFIG_KDBUS) += kdbus$(KDBUS_EXT).o
diff --git a/ipc/kdbus/bus.c b/ipc/kdbus/bus.c
new file mode 100644
index 000000000..a67f825bd
--- /dev/null
+++ b/ipc/kdbus/bus.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "notify.h"
+#include "connection.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "match.h"
+#include "message.h"
+#include "metadata.h"
+#include "names.h"
+#include "policy.h"
+#include "util.h"
+
+static void kdbus_bus_free(struct kdbus_node *node) /* used as node.free_cb */
+{
+	struct kdbus_bus *bus = container_of(node, struct kdbus_bus, node);
+
+	WARN_ON(!list_empty(&bus->monitors_list)); /* monitors must be gone */
+	WARN_ON(!hash_empty(bus->conn_hash)); /* connections must be gone */
+
+	kdbus_notify_free(bus);
+
+	/* members below may be NULL when kdbus_bus_new() bailed out early */
+	kdbus_user_unref(bus->creator);
+	kdbus_name_registry_free(bus->name_registry);
+	kdbus_domain_unref(bus->domain);
+	kdbus_policy_db_clear(&bus->policy_db);
+	kdbus_meta_proc_unref(bus->creator_meta);
+	kfree(bus); /* the node is embedded in the bus, so this frees it too */
+}
+
+static void kdbus_bus_release(struct kdbus_node *node, bool was_active)
+{
+	struct kdbus_bus *bus = container_of(node, struct kdbus_bus, node);
+
+	if (was_active) /* release_cb: undo count from kdbus_cmd_bus_make() */
+		atomic_dec(&bus->creator->buses);
+}
+
+static struct kdbus_bus *kdbus_bus_new(struct kdbus_domain *domain,
+				       const char *name,
+				       struct kdbus_bloom_parameter *bloom,
+				       const u64 *pattach_owner,
+				       u64 flags, kuid_t uid, kgid_t gid)
+{
+	struct kdbus_bus *b;
+	u64 attach_owner;
+	int ret;
+
+	/* validate all input before anything is allocated */
+	if (bloom->size < 8 || bloom->size > KDBUS_BUS_BLOOM_MAX_SIZE ||
+	    !KDBUS_IS_ALIGNED8(bloom->size) || bloom->n_hash < 1)
+		return ERR_PTR(-EINVAL);
+
+	/* a missing attach-flags item is treated as "no flags" (0) */
+	ret = kdbus_sanitize_attach_flags(pattach_owner ? *pattach_owner : 0,
+					  &attach_owner);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	ret = kdbus_verify_uid_prefix(name, domain->user_namespace, uid);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	b = kzalloc(sizeof(*b), GFP_KERNEL);
+	if (!b)
+		return ERR_PTR(-ENOMEM);
+
+	kdbus_node_init(&b->node, KDBUS_NODE_BUS);
+
+	b->node.free_cb = kdbus_bus_free;
+	b->node.release_cb = kdbus_bus_release;
+	b->node.uid = uid;
+	b->node.gid = gid;
+	b->node.mode = S_IRUSR | S_IXUSR; /* owner-only unless flags widen it */
+
+	/* ACCESS_WORLD also grants group access */
+	if (flags & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD))
+		b->node.mode |= S_IRGRP | S_IXGRP;
+	if (flags & KDBUS_MAKE_ACCESS_WORLD)
+		b->node.mode |= S_IROTH | S_IXOTH;
+
+	b->id = atomic64_inc_return(&domain->last_id);
+	b->bus_flags = flags;
+	b->attach_flags_owner = attach_owner;
+	generate_random_uuid(b->id128);
+	b->bloom = *bloom;
+	b->domain = kdbus_domain_ref(domain);
+
+	kdbus_policy_db_init(&b->policy_db);
+
+	init_rwsem(&b->conn_rwlock);
+	hash_init(b->conn_hash);
+	INIT_LIST_HEAD(&b->monitors_list);
+
+	INIT_LIST_HEAD(&b->notify_list);
+	spin_lock_init(&b->notify_lock);
+	mutex_init(&b->notify_flush_lock);
+
+	/* from here on, errors are unwound through the node (exit_unref) */
+	ret = kdbus_node_link(&b->node, &domain->node, name);
+	if (ret < 0)
+		goto exit_unref;
+
+	/* cache the metadata/credentials of the creator */
+	b->creator_meta = kdbus_meta_proc_new();
+	if (IS_ERR(b->creator_meta)) {
+		ret = PTR_ERR(b->creator_meta);
+		b->creator_meta = NULL; /* kdbus_bus_free() must not see ERR_PTR */
+		goto exit_unref;
+	}
+
+	ret = kdbus_meta_proc_collect(b->creator_meta,
+				      KDBUS_ATTACH_CREDS |
+				      KDBUS_ATTACH_PIDS |
+				      KDBUS_ATTACH_AUXGROUPS |
+				      KDBUS_ATTACH_TID_COMM |
+				      KDBUS_ATTACH_PID_COMM |
+				      KDBUS_ATTACH_EXE |
+				      KDBUS_ATTACH_CMDLINE |
+				      KDBUS_ATTACH_CGROUP |
+				      KDBUS_ATTACH_CAPS |
+				      KDBUS_ATTACH_SECLABEL |
+				      KDBUS_ATTACH_AUDIT);
+	if (ret < 0)
+		goto exit_unref;
+
+	b->name_registry = kdbus_name_registry_new();
+	if (IS_ERR(b->name_registry)) {
+		ret = PTR_ERR(b->name_registry);
+		b->name_registry = NULL; /* same reason as creator_meta above */
+		goto exit_unref;
+	}
+
+	/*
+	 * Bus-limits of the creator are accounted on its real UID, just like
+	 * all other per-user limits.
+	 */
+	b->creator = kdbus_user_lookup(domain, current_uid());
+	if (IS_ERR(b->creator)) {
+		ret = PTR_ERR(b->creator);
+		b->creator = NULL;
+		goto exit_unref;
+	}
+
+	return b; /* linked but not activated; the caller activates the node */
+
+exit_unref:
+	kdbus_node_deactivate(&b->node);
+	kdbus_node_unref(&b->node); /* presumably last ref: free_cb cleans up */
+	return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_bus_ref() - increase the reference counter of a kdbus_bus
+ * @bus:		The bus to reference, may be %NULL
+ *
+ * Every user of a bus, except for its creator, must add a reference to the
+ * kdbus_bus using this function. The reference is taken on the embedded node.
+ *
+ * Return: the bus itself (%NULL if @bus is %NULL)
+ */
+struct kdbus_bus *kdbus_bus_ref(struct kdbus_bus *bus)
+{
+	if (bus)
+		kdbus_node_ref(&bus->node);
+	return bus;
+}
+
+/**
+ * kdbus_bus_unref() - decrease the reference counter of a kdbus_bus
+ * @bus:		The bus to unref, may be %NULL
+ *
+ * Release a reference on the embedded node. If the reference count drops to
+ * 0, the bus will be freed.
+ *
+ * Return: NULL, always
+ */
+struct kdbus_bus *kdbus_bus_unref(struct kdbus_bus *bus)
+{
+	if (bus)
+		kdbus_node_unref(&bus->node);
+	return NULL;
+}
+
+/**
+ * kdbus_bus_find_conn_by_id() - find a connection with a given id
+ * @bus:		The bus to look for the connection
+ * @id:			The 64-bit connection id
+ *
+ * Looks up a connection with a given id while holding the read side of
+ * @bus->conn_rwlock. A returned connection must be unref'ed by the user.
+ * Return: the ref'ed connection, or NULL if the connection can't be found.
+ */
+struct kdbus_conn *kdbus_bus_find_conn_by_id(struct kdbus_bus *bus, u64 id)
+{
+	struct kdbus_conn *conn, *found = NULL;
+
+	down_read(&bus->conn_rwlock);
+	hash_for_each_possible(bus->conn_hash, conn, hentry, id)
+		if (conn->id == id) { /* bucket may hold other ids as well */
+			found = kdbus_conn_ref(conn);
+			break;
+		}
+	up_read(&bus->conn_rwlock);
+
+	return found;
+}
+
+/**
+ * kdbus_bus_broadcast() - send a message to all subscribed connections
+ * @bus:	The bus the connections are connected to
+ * @conn_src:	The source connection, may be %NULL for kernel notifications
+ * @staging:	Staging object containing the message to send
+ *
+ * Send message to all connections that are currently active on the bus.
+ * Connections must still have matches installed in order to let the message
+ * pass.
+ *
+ * The caller must hold the name-registry lock of @bus.
+ */
+void kdbus_bus_broadcast(struct kdbus_bus *bus,
+			 struct kdbus_conn *conn_src,
+			 struct kdbus_staging *staging)
+{
+	struct kdbus_conn *conn_dst;
+	unsigned int i;
+	int ret;
+
+	lockdep_assert_held(&bus->name_registry->rwlock);
+
+	/*
+	 * Make sure a broadcast is queued on monitors before we send it out to
+	 * anyone else. Otherwise, connections might react to broadcasts before
+	 * the monitor gets the broadcast queued. In the worst case, the
+	 * monitor sees a reaction to the broadcast before the broadcast itself.
+	 * We don't give ordering guarantees across connections (and monitors
+	 * can re-construct order via sequence numbers), but we should at least
+	 * try to avoid re-ordering for monitors.
+	 */
+	kdbus_bus_eavesdrop(bus, conn_src, staging);
+
+	down_read(&bus->conn_rwlock);
+	hash_for_each(bus->conn_hash, i, conn_dst, hentry) {
+		if (!kdbus_conn_is_ordinary(conn_dst)) /* ordinary conns only */
+			continue;
+
+		/*
+		 * Check if there is a match for the staged message in
+		 * the destination connection match db
+		 */
+		if (!kdbus_match_db_match_msg(conn_dst->match_db, conn_src,
+					      staging))
+			continue;
+
+		if (conn_src) {
+			/*
+			 * Anyone can send broadcasts, as they have no
+			 * destination. But a receiver needs TALK access to
+			 * the sender in order to receive broadcasts.
+			 */
+			if (!kdbus_conn_policy_talk(conn_dst, NULL, conn_src))
+				continue;
+		} else {
+			/*
+			 * Check if there is a policy db that prevents the
+			 * destination connection from receiving this kernel
+			 * notification
+			 */
+			if (!kdbus_conn_policy_see_notification(conn_dst, NULL,
+								staging->msg))
+				continue;
+		}
+
+		ret = kdbus_conn_entry_insert(conn_src, conn_dst, staging,
+					      NULL, NULL);
+		if (ret < 0) /* delivery is best-effort; note the loss */
+			kdbus_conn_lost_message(conn_dst);
+	}
+	up_read(&bus->conn_rwlock);
+}
+
+/**
+ * kdbus_bus_eavesdrop() - send a message to all subscribed monitors
+ * @bus:	The bus the monitors are connected to
+ * @conn_src:	The source connection, may be %NULL for kernel notifications
+ * @staging:	Staging object containing the message to send
+ *
+ * Queue the message on every monitor connection of the bus. No match lookup is
+ * done in this function; failed inserts go to kdbus_conn_lost_message().
+ *
+ * The caller must hold the name-registry lock of @bus.
+ */
+void kdbus_bus_eavesdrop(struct kdbus_bus *bus,
+			 struct kdbus_conn *conn_src,
+			 struct kdbus_staging *staging)
+{
+	struct kdbus_conn *conn_dst;
+	int ret;
+
+	/*
+	 * Monitor connections get all messages; ignore possible errors
+	 * when sending messages to monitor connections.
+	 */
+
+	lockdep_assert_held(&bus->name_registry->rwlock);
+
+	down_read(&bus->conn_rwlock); /* protects monitors_list */
+	list_for_each_entry(conn_dst, &bus->monitors_list, monitor_entry) {
+		ret = kdbus_conn_entry_insert(conn_src, conn_dst, staging,
+					      NULL, NULL);
+		if (ret < 0)
+			kdbus_conn_lost_message(conn_dst);
+	}
+	up_read(&bus->conn_rwlock);
+}
+
+/**
+ * kdbus_cmd_bus_make() - handle KDBUS_CMD_BUS_MAKE
+ * @domain:		domain to operate on
+ * @argp:		command payload
+ *
+ * Return: NULL or newly created bus on success, ERR_PTR on failure.
+ */
+struct kdbus_bus *kdbus_cmd_bus_make(struct kdbus_domain *domain,
+				     void __user *argp)
+{
+	struct kdbus_bus *bus = NULL;
+	struct kdbus_cmd *cmd;
+	struct kdbus_ep *ep = NULL;
+	int ret;
+
+	/* the order matters: items are picked up by index further down */
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_MAKE_NAME, .mandatory = true },
+		{ .type = KDBUS_ITEM_BLOOM_PARAMETER, .mandatory = true },
+		{ .type = KDBUS_ITEM_ATTACH_FLAGS_SEND },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_MAKE_ACCESS_GROUP |
+				 KDBUS_MAKE_ACCESS_WORLD,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL; /* presumably negotiation only - TODO confirm */
+
+	/* argv[1] = name, argv[2] = bloom, argv[3] = optional attach flags */
+	bus = kdbus_bus_new(domain,
+			    argv[1].item->str, &argv[2].item->bloom_parameter,
+			    argv[3].item ? argv[3].item->data64 : NULL,
+			    cmd->flags, current_euid(), current_egid());
+	if (IS_ERR(bus)) {
+		ret = PTR_ERR(bus);
+		bus = NULL;
+		goto exit;
+	}
+
+	/* enforce the per-user bus limit; roll back our increment on failure */
+	if (atomic_inc_return(&bus->creator->buses) > KDBUS_USER_MAX_BUSES) {
+		atomic_dec(&bus->creator->buses);
+		ret = -EMFILE;
+		goto exit;
+	}
+
+	/* once active, kdbus_bus_release() owns dropping the bus count */
+	if (!kdbus_node_activate(&bus->node)) {
+		atomic_dec(&bus->creator->buses);
+		ret = -ESHUTDOWN;
+		goto exit;
+	}
+
+	/* every bus gets a default endpoint named "bus" */
+	ep = kdbus_ep_new(bus, "bus", cmd->flags, bus->node.uid, bus->node.gid,
+			  false);
+	if (IS_ERR(ep)) {
+		ret = PTR_ERR(ep);
+		ep = NULL;
+		goto exit;
+	}
+
+	if (!kdbus_node_activate(&ep->node)) {
+		ret = -ESHUTDOWN;
+		goto exit;
+	}
+
+	/*
+	 * Drop our own reference, effectively causing the endpoint to be
+	 * deactivated and released when the parent bus is.
+	 */
+	ep = kdbus_ep_unref(ep);
+
+exit:
+	ret = kdbus_args_clear(&args, ret);
+	if (ret < 0) {
+		if (ep) { /* only non-NULL if ep activation failed */
+			kdbus_node_deactivate(&ep->node);
+			kdbus_ep_unref(ep);
+		}
+		if (bus) {
+			kdbus_node_deactivate(&bus->node);
+			kdbus_bus_unref(bus);
+		}
+		return ERR_PTR(ret);
+	}
+	return bus;
+}
+
+/**
+ * kdbus_cmd_bus_creator_info() - handle KDBUS_CMD_BUS_CREATOR_INFO
+ * @conn:		connection to operate on
+ * @argp:		command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_bus_creator_info(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_cmd_info *cmd;
+	struct kdbus_bus *bus = conn->ep->bus;
+	struct kdbus_pool_slice *slice = NULL;
+	struct kdbus_item *meta_items = NULL;
+	struct kdbus_item_header item_hdr;
+	struct kdbus_info info = {};
+	size_t meta_size, name_len, cnt = 0;
+	struct kvec kvec[6]; /* info, item hdr, name, pad, meta items, pad */
+	u64 attach_flags, size = 0;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	ret = kdbus_sanitize_attach_flags(cmd->attach_flags, &attach_flags);
+	if (ret < 0)
+		goto exit;
+
+	attach_flags &= bus->attach_flags_owner; /* cap by what the owner set */
+
+	ret = kdbus_meta_emit(bus->creator_meta, NULL, NULL, conn,
+			      attach_flags, &meta_items, &meta_size);
+	if (ret < 0)
+		goto exit;
+
+	name_len = strlen(bus->node.name) + 1; /* include the trailing NUL */
+	info.id = bus->id;
+	info.flags = bus->bus_flags;
+	item_hdr.type = KDBUS_ITEM_MAKE_NAME;
+	item_hdr.size = KDBUS_ITEM_HEADER_SIZE + name_len;
+
+	/* gather the reply; @size accumulates the total length as we go */
+	kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &size);
+	kdbus_kvec_set(&kvec[cnt++], &item_hdr, sizeof(item_hdr), &size);
+	kdbus_kvec_set(&kvec[cnt++], bus->node.name, name_len, &size);
+	cnt += !!kdbus_kvec_pad(&kvec[cnt], &size); /* slot used only if padded */
+	if (meta_size > 0) {
+		kdbus_kvec_set(&kvec[cnt++], meta_items, meta_size, &size);
+		cnt += !!kdbus_kvec_pad(&kvec[cnt], &size);
+	}
+
+	info.size = size; /* kvec[0] points at @info, so set before copying */
+
+	slice = kdbus_pool_slice_alloc(conn->pool, size, false);
+	if (IS_ERR(slice)) {
+		ret = PTR_ERR(slice);
+		slice = NULL;
+		goto exit;
+	}
+
+	ret = kdbus_pool_slice_copy_kvec(slice, 0, kvec, cnt, size);
+	if (ret < 0)
+		goto exit;
+
+	kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->info_size);
+
+	/* report the slice offset and size back in the user's command struct */
+	if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) ||
+	    kdbus_member_set_user(&cmd->info_size, argp,
+				  typeof(*cmd), info_size))
+		ret = -EFAULT;
+
+exit:
+	kdbus_pool_slice_release(slice); /* called on success and error alike */
+	kfree(meta_items);
+	return kdbus_args_clear(&args, ret);
+}
diff --git a/ipc/kdbus/bus.h b/ipc/kdbus/bus.h
new file mode 100644
index 000000000..238986eff
--- /dev/null
+++ b/ipc/kdbus/bus.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_BUS_H
+#define __KDBUS_BUS_H
+
+#include <linux/hashtable.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/spinlock.h>
+#include <uapi/linux/kdbus.h>
+
+#include "metadata.h"
+#include "names.h"
+#include "node.h"
+#include "policy.h"
+
+struct kdbus_conn;
+struct kdbus_domain;
+struct kdbus_staging;
+struct kdbus_user;
+
+/**
+ * struct kdbus_bus - bus in a domain
+ * @node:		kdbus_node
+ * @id:			ID of this bus in the domain
+ * @bus_flags:		Simple pass-through flags from userspace to userspace
+ * @attach_flags_owner:	KDBUS_ATTACH_* flags of bus creator that other
+ *			connections can see or query
+ * @id128:		Unique random 128 bit ID of this bus
+ * @bloom:		Bloom parameters
+ * @domain:		Domain of this bus
+ * @creator:		User that created the bus, used for bus accounting
+ * @creator_meta:	Meta information about the bus creator
+ * @policy_db:		Policy database for this bus
+ * @name_registry:	Name registry of this bus
+ * @conn_rwlock:	Read/Write lock for all lists of child connections
+ * @conn_hash:		Hashtable of connections, keyed by connection ID
+ * @monitors_list:	Connections that monitor this bus
+ * @notify_list:	List of pending kernel-generated messages
+ * @notify_lock:	Notification list lock
+ * @notify_flush_lock:	Notification flushing lock
+ */
+struct kdbus_bus {
+	struct kdbus_node node;
+
+	/* static */
+	u64 id;
+	u64 bus_flags;
+	u64 attach_flags_owner;
+	u8 id128[16];
+	struct kdbus_bloom_parameter bloom;
+	struct kdbus_domain *domain;
+	struct kdbus_user *creator;
+	struct kdbus_meta_proc *creator_meta;
+
+	/* protected by own locks */
+	struct kdbus_policy_db policy_db;
+	struct kdbus_name_registry *name_registry;
+
+	/* protected by conn_rwlock */
+	struct rw_semaphore conn_rwlock;
+	DECLARE_HASHTABLE(conn_hash, 8);
+	struct list_head monitors_list;
+
+	/* protected by notify_lock */
+	struct list_head notify_list;
+	spinlock_t notify_lock;
+	struct mutex notify_flush_lock;
+};
+
+struct kdbus_bus *kdbus_bus_ref(struct kdbus_bus *bus);
+struct kdbus_bus *kdbus_bus_unref(struct kdbus_bus *bus);
+
+struct kdbus_conn *kdbus_bus_find_conn_by_id(struct kdbus_bus *bus, u64 id);
+void kdbus_bus_broadcast(struct kdbus_bus *bus,
+			 struct kdbus_conn *conn_src,
+			 struct kdbus_staging *staging);
+void kdbus_bus_eavesdrop(struct kdbus_bus *bus,
+			 struct kdbus_conn *conn_src,
+			 struct kdbus_staging *staging);
+
+struct kdbus_bus *kdbus_cmd_bus_make(struct kdbus_domain *domain,
+				     void __user *argp);
+int kdbus_cmd_bus_creator_info(struct kdbus_conn *conn, void __user *argp);
+
+#endif
diff --git a/ipc/kdbus/connection.c b/ipc/kdbus/connection.c
new file mode 100644
index 000000000..d94b417e0
--- /dev/null
+++ b/ipc/kdbus/connection.c
@@ -0,0 +1,2207 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/audit.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fs_struct.h>
+#include <linux/hashtable.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/math64.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/path.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/shmem_fs.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "match.h"
+#include "message.h"
+#include "metadata.h"
+#include "names.h"
+#include "domain.h"
+#include "item.h"
+#include "notify.h"
+#include "policy.h"
+#include "pool.h"
+#include "reply.h"
+#include "util.h"
+#include "queue.h"
+
+#define KDBUS_CONN_ACTIVE_BIAS	(INT_MIN + 2) /* added to ->active on disconnect */
+#define KDBUS_CONN_ACTIVE_NEW	(INT_MIN + 1) /* ->active before first connect */
+
+static struct kdbus_conn *kdbus_conn_new(struct kdbus_ep *ep, bool privileged,
+					 struct kdbus_cmd_hello *hello,
+					 const char *name,
+					 const struct kdbus_creds *creds,
+					 const struct kdbus_pids *pids,
+					 const char *seclabel,
+					 const char *conn_description)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	static struct lock_class_key __key;
+#endif
+	struct kdbus_pool_slice *slice = NULL;
+	struct kdbus_bus *bus = ep->bus;
+	struct kdbus_conn *conn;
+	u64 attach_flags_send;
+	u64 attach_flags_recv;
+	u64 items_size = 0;
+	bool is_policy_holder;
+	bool is_activator;
+	bool is_monitor;
+	struct kvec kvec;
+	int ret;
+
+	/* bloom-parameter item that is copied into the new pool below */
+	struct {
+		u64 size;
+		u64 type;
+		struct kdbus_bloom_parameter bloom;
+	} bloom_item;
+
+	is_monitor = hello->flags & KDBUS_HELLO_MONITOR;
+	is_activator = hello->flags & KDBUS_HELLO_ACTIVATOR;
+	is_policy_holder = hello->flags & KDBUS_HELLO_POLICY_HOLDER;
+
+	if (!hello->pool_size || !IS_ALIGNED(hello->pool_size, PAGE_SIZE))
+		return ERR_PTR(-EINVAL);
+	if (is_monitor + is_activator + is_policy_holder > 1) /* one role max */
+		return ERR_PTR(-EINVAL);
+	if (name && !is_activator && !is_policy_holder) /* name needs a role */
+		return ERR_PTR(-EINVAL);
+	if (!name && (is_activator || is_policy_holder)) /* role needs a name */
+		return ERR_PTR(-EINVAL);
+	if (name && !kdbus_name_is_valid(name, true))
+		return ERR_PTR(-EINVAL);
+	if (is_monitor && ep->user) /* no monitors on endpoints with a user */
+		return ERR_PTR(-EOPNOTSUPP);
+	if (!privileged && (is_activator || is_policy_holder || is_monitor))
+		return ERR_PTR(-EPERM);
+	if ((creds || pids || seclabel) && !privileged) /* faking needs privs */
+		return ERR_PTR(-EPERM);
+
+	ret = kdbus_sanitize_attach_flags(hello->attach_flags_send,
+					  &attach_flags_send);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	ret = kdbus_sanitize_attach_flags(hello->attach_flags_recv,
+					  &attach_flags_recv);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	conn = kzalloc(sizeof(*conn), GFP_KERNEL);
+	if (!conn)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&conn->kref);
+	atomic_set(&conn->active, KDBUS_CONN_ACTIVE_NEW); /* not connected yet */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	lockdep_init_map(&conn->dep_map, "s_active", &__key, 0);
+#endif
+	mutex_init(&conn->lock);
+	INIT_LIST_HEAD(&conn->names_list);
+	INIT_LIST_HEAD(&conn->names_queue_list);
+	INIT_LIST_HEAD(&conn->reply_list);
+	atomic_set(&conn->name_count, 0);
+	atomic_set(&conn->request_count, 0);
+	atomic_set(&conn->lost_count, 0);
+	INIT_DELAYED_WORK(&conn->work, kdbus_reply_list_scan_work);
+	conn->cred = get_current_cred();
+	conn->pid = get_pid(task_pid(current));
+	get_fs_root(current->fs, &conn->root_path);
+	init_waitqueue_head(&conn->wait);
+	kdbus_queue_init(&conn->queue);
+	conn->privileged = privileged;
+	conn->ep = kdbus_ep_ref(ep);
+	conn->id = atomic64_inc_return(&bus->domain->last_id);
+	conn->flags = hello->flags;
+	atomic64_set(&conn->attach_flags_send, attach_flags_send);
+	atomic64_set(&conn->attach_flags_recv, attach_flags_recv);
+	INIT_LIST_HEAD(&conn->monitor_entry);
+
+	/* from here on, failures are unwound through kdbus_conn_unref() */
+	if (conn_description) {
+		conn->description = kstrdup(conn_description, GFP_KERNEL);
+		if (!conn->description) {
+			ret = -ENOMEM;
+			goto exit_unref;
+		}
+	}
+
+	conn->pool = kdbus_pool_new(conn->description, hello->pool_size);
+	if (IS_ERR(conn->pool)) {
+		ret = PTR_ERR(conn->pool);
+		conn->pool = NULL; /* keep ERR_PTR away from the destructor */
+		goto exit_unref;
+	}
+
+	conn->match_db = kdbus_match_db_new();
+	if (IS_ERR(conn->match_db)) {
+		ret = PTR_ERR(conn->match_db);
+		conn->match_db = NULL;
+		goto exit_unref;
+	}
+
+	/* return properties of this connection to the caller */
+	hello->bus_flags = bus->bus_flags;
+	hello->id = conn->id;
+
+	BUILD_BUG_ON(sizeof(bus->id128) != sizeof(hello->id128));
+	memcpy(hello->id128, bus->id128, sizeof(hello->id128));
+
+	/* privileged processes can impersonate somebody else */
+	if (creds || pids || seclabel) {
+		conn->meta_fake = kdbus_meta_fake_new();
+		if (IS_ERR(conn->meta_fake)) {
+			ret = PTR_ERR(conn->meta_fake);
+			conn->meta_fake = NULL;
+			goto exit_unref;
+		}
+
+		ret = kdbus_meta_fake_collect(conn->meta_fake,
+					      creds, pids, seclabel);
+		if (ret < 0)
+			goto exit_unref;
+	} else {
+		/* otherwise collect the metadata of the calling process */
+		conn->meta_proc = kdbus_meta_proc_new();
+		if (IS_ERR(conn->meta_proc)) {
+			ret = PTR_ERR(conn->meta_proc);
+			conn->meta_proc = NULL;
+			goto exit_unref;
+		}
+
+		ret = kdbus_meta_proc_collect(conn->meta_proc,
+					      KDBUS_ATTACH_CREDS |
+					      KDBUS_ATTACH_PIDS |
+					      KDBUS_ATTACH_AUXGROUPS |
+					      KDBUS_ATTACH_TID_COMM |
+					      KDBUS_ATTACH_PID_COMM |
+					      KDBUS_ATTACH_EXE |
+					      KDBUS_ATTACH_CMDLINE |
+					      KDBUS_ATTACH_CGROUP |
+					      KDBUS_ATTACH_CAPS |
+					      KDBUS_ATTACH_SECLABEL |
+					      KDBUS_ATTACH_AUDIT);
+		if (ret < 0)
+			goto exit_unref;
+	}
+
+	/*
+	 * Account the connection against the current user (UID), or for
+	 * custom endpoints use the anonymous user assigned to the endpoint.
+	 * Note that limits are always accounted against the real UID, not
+	 * the effective UID (cred->user always points to the accounting of
+	 * cred->uid, not cred->euid).
+	 */
+	if (ep->user) {
+		conn->user = kdbus_user_ref(ep->user);
+	} else {
+		conn->user = kdbus_user_lookup(ep->bus->domain, current_uid());
+		if (IS_ERR(conn->user)) {
+			ret = PTR_ERR(conn->user);
+			conn->user = NULL;
+			goto exit_unref;
+		}
+	}
+
+	if (atomic_inc_return(&conn->user->connections) > KDBUS_USER_MAX_CONN) {
+		/* decremented by destructor as conn->user is valid */
+		ret = -EMFILE;
+		goto exit_unref;
+	}
+
+	/* place a bloom-parameter item in the pool and report its location */
+	bloom_item.size = sizeof(bloom_item);
+	bloom_item.type = KDBUS_ITEM_BLOOM_PARAMETER;
+	bloom_item.bloom = bus->bloom;
+	kdbus_kvec_set(&kvec, &bloom_item, bloom_item.size, &items_size);
+
+	slice = kdbus_pool_slice_alloc(conn->pool, items_size, false);
+	if (IS_ERR(slice)) {
+		ret = PTR_ERR(slice);
+		slice = NULL;
+		goto exit_unref;
+	}
+
+	ret = kdbus_pool_slice_copy_kvec(slice, 0, &kvec, 1, items_size);
+	if (ret < 0)
+		goto exit_unref;
+
+	kdbus_pool_slice_publish(slice, &hello->offset, &hello->items_size);
+	kdbus_pool_slice_release(slice);
+
+	return conn; /* still in the NEW state; see kdbus_conn_connect() */
+
+exit_unref:
+	kdbus_pool_slice_release(slice); /* slice may be NULL here */
+	kdbus_conn_unref(conn);
+	return ERR_PTR(ret);
+}
+
+static void __kdbus_conn_free(struct kref *kref) /* kref release callback */
+{
+	struct kdbus_conn *conn = container_of(kref, struct kdbus_conn, kref);
+
+	/* a connection must be disconnected and drained before it is freed */
+	WARN_ON(kdbus_conn_active(conn));
+	WARN_ON(delayed_work_pending(&conn->work));
+	WARN_ON(!list_empty(&conn->queue.msg_list));
+	WARN_ON(!list_empty(&conn->names_list));
+	WARN_ON(!list_empty(&conn->names_queue_list));
+	WARN_ON(!list_empty(&conn->reply_list));
+
+	if (conn->user) { /* undo the accounting done in kdbus_conn_new() */
+		atomic_dec(&conn->user->connections);
+		kdbus_user_unref(conn->user);
+	}
+
+	/* members may be NULL when kdbus_conn_new() failed half-way */
+	kdbus_meta_fake_free(conn->meta_fake);
+	kdbus_meta_proc_unref(conn->meta_proc);
+	kdbus_match_db_free(conn->match_db);
+	kdbus_pool_free(conn->pool);
+	kdbus_ep_unref(conn->ep);
+	path_put(&conn->root_path);
+	put_pid(conn->pid);
+	put_cred(conn->cred);
+	kfree(conn->description);
+	kfree(conn->quota);
+	kfree(conn);
+}
+
+/**
+ * kdbus_conn_ref() - take a connection reference
+ * @conn:		Connection, may be %NULL
+ *
+ * Return: the connection itself (%NULL if @conn is %NULL)
+ */
+struct kdbus_conn *kdbus_conn_ref(struct kdbus_conn *conn)
+{
+	if (conn)
+		kref_get(&conn->kref);
+	return conn;
+}
+
+/**
+ * kdbus_conn_unref() - drop a connection reference
+ * @conn:		Connection (may be NULL)
+ *
+ * When the last reference is dropped, the connection is released via
+ * __kdbus_conn_free().
+ *
+ * Return: NULL, always
+ */
+struct kdbus_conn *kdbus_conn_unref(struct kdbus_conn *conn)
+{
+	if (conn)
+		kref_put(&conn->kref, __kdbus_conn_free);
+	return NULL;
+}
+
+/**
+ * kdbus_conn_active() - connection is not disconnected
+ * @conn:		Connection to check
+ *
+ * Check whether the connection was not disconnected, yet. Note that a
+ * connection might be disconnected asynchronously, unless you hold the
+ * connection lock. If that's not suitable for you, see kdbus_conn_acquire() to
+ * suppress connection shutdown for a short period.
+ *
+ * Return: true if the connection is still active (conn->active >= 0)
+ */
+bool kdbus_conn_active(const struct kdbus_conn *conn)
+{
+	return atomic_read(&conn->active) >= 0; /* NEW/BIAS states are < 0 */
+}
+
+/**
+ * kdbus_conn_acquire() - acquire an active connection reference
+ * @conn:		Connection
+ *
+ * Users can close a connection via KDBUS_BYEBYE (or by destroying the
+ * endpoint/bus/...) at any time. Whenever this happens, we should deny any
+ * user-visible action on this connection and signal ECONNRESET instead.
+ * To avoid testing for connection availability every time you take the
+ * connection-lock, you can acquire a connection for short periods.
+ *
+ * By calling kdbus_conn_acquire(), you gain an "active reference" to the
+ * connection. You must also hold a regular reference at any time! As long as
+ * you hold the active-ref, the connection will not be shut down. However, if
+ * the connection was shut down, you can never acquire an active-ref again.
+ *
+ * kdbus_conn_disconnect() disables the connection and then waits for all active
+ * references to be dropped. It will also wake up any pending operation.
+ * However, you must not sleep for an indefinite period while holding an
+ * active-reference. Otherwise, kdbus_conn_disconnect() might stall. If you need
+ * to sleep for an indefinite period, either release the reference and try to
+ * acquire it again after waking up, or make kdbus_conn_disconnect() wake up
+ * your wait-queue.
+ *
+ * Return: 0 on success, -ECONNRESET if the connection is not active.
+ */
+int kdbus_conn_acquire(struct kdbus_conn *conn)
+{
+	if (!atomic_inc_unless_negative(&conn->active)) /* new or disconnected */
+		return -ECONNRESET;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	rwsem_acquire_read(&conn->dep_map, 0, 1, _RET_IP_);
+#endif
+
+	return 0;
+}
+
+/**
+ * kdbus_conn_release() - release an active connection reference
+ * @conn:		Connection, may be %NULL
+ *
+ * This releases an active reference that has been acquired via
+ * kdbus_conn_acquire(). If the connection was already disabled and this is the
+ * last active-ref that is dropped, the disconnect-waiter will be woken up and
+ * properly close the connection.
+ */
+void kdbus_conn_release(struct kdbus_conn *conn)
+{
+	int v;
+
+	if (!conn)
+		return;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	rwsem_release(&conn->dep_map, 1, _RET_IP_);
+#endif
+
+	v = atomic_dec_return(&conn->active);
+	if (v != KDBUS_CONN_ACTIVE_BIAS) /* not the last ref of a dying conn */
+		return;
+
+	wake_up_all(&conn->wait); /* kdbus_conn_disconnect() waits on this */
+}
+
+static int kdbus_conn_connect(struct kdbus_conn *conn, const char *name)
+{
+	struct kdbus_ep *ep = conn->ep;
+	struct kdbus_bus *bus = ep->bus;
+	int ret;
+
+	/* only a connection fresh from kdbus_conn_new() may be connected */
+	if (WARN_ON(atomic_read(&conn->active) != KDBUS_CONN_ACTIVE_NEW))
+		return -EALREADY;
+
+	/* make sure the ep-node is active while we add our connection */
+	if (!kdbus_node_acquire(&ep->node))
+		return -ESHUTDOWN;
+
+	/* lock order: domain -> bus -> ep -> names -> conn */
+	mutex_lock(&ep->lock);
+	down_write(&bus->conn_rwlock);
+
+	/* link into monitor list */
+	if (kdbus_conn_is_monitor(conn))
+		list_add_tail(&conn->monitor_entry, &bus->monitors_list);
+
+	/* link into bus and endpoint */
+	list_add_tail(&conn->ep_entry, &ep->conn_list);
+	hash_add(bus->conn_hash, &conn->hentry, conn->id);
+
+	/* enable lookups and acquire active ref */
+	atomic_set(&conn->active, 1);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	rwsem_acquire_read(&conn->dep_map, 0, 1, _RET_IP_);
+#endif
+
+	up_write(&bus->conn_rwlock);
+	mutex_unlock(&ep->lock);
+
+	kdbus_node_release(&ep->node); /* pairs with kdbus_node_acquire() above */
+
+	/*
+	 * Notify subscribers about the new active connection, unless it is
+	 * a monitor. Monitors are invisible on the bus, can't be addressed
+	 * directly, and won't cause any notifications.
+	 */
+	if (!kdbus_conn_is_monitor(conn)) {
+		ret = kdbus_notify_id_change(bus, KDBUS_ITEM_ID_ADD,
+					     conn->id, conn->flags);
+		if (ret < 0)
+			goto exit_disconnect;
+	}
+
+	/* activators acquire their name right away, flagged as activator */
+	if (kdbus_conn_is_activator(conn)) {
+		u64 flags = KDBUS_NAME_ACTIVATOR;
+
+		if (WARN_ON(!name)) {
+			ret = -EINVAL;
+			goto exit_disconnect;
+		}
+
+		ret = kdbus_name_acquire(bus->name_registry, conn, name,
+					 flags, NULL);
+		if (ret < 0)
+			goto exit_disconnect;
+	}
+
+	kdbus_conn_release(conn); /* drop the active ref set up above */
+	kdbus_notify_flush(bus);
+	return 0;
+
+exit_disconnect:
+	kdbus_conn_release(conn); /* disconnect waits for active refs to drop */
+	kdbus_conn_disconnect(conn, false);
+	return ret;
+}
+
+/**
+ * kdbus_conn_disconnect() - disconnect a connection
+ * @conn:		The connection to disconnect
+ * @ensure_queue_empty:	Flag to indicate if the call should fail in case
+ *			the connection's message list is not empty
+ *
+ * Adds KDBUS_CONN_ACTIVE_BIAS to the active counter, waits until all active
+ * references are released, then unlinks @conn and cleans up its state.
+ *
+ * Return: 0 on success or if @conn was never connected, -ECONNRESET if it is
+ * already dead, -EBUSY if @ensure_queue_empty is set and messages are pending
+ */
+int kdbus_conn_disconnect(struct kdbus_conn *conn, bool ensure_queue_empty)
+{
+	struct kdbus_queue_entry *entry, *tmp;
+	struct kdbus_bus *bus = conn->ep->bus;
+	struct kdbus_reply *r, *r_tmp;
+	struct kdbus_conn *c;
+	int i, v;
+
+	mutex_lock(&conn->lock);
+	v = atomic_read(&conn->active);
+	if (v == KDBUS_CONN_ACTIVE_NEW) {
+		/* was never connected */
+		mutex_unlock(&conn->lock);
+		return 0;
+	}
+	if (v < 0) {
+		/* already dead */
+		mutex_unlock(&conn->lock);
+		return -ECONNRESET;
+	}
+	if (ensure_queue_empty && !list_empty(&conn->queue.msg_list)) {
+		/* still busy: the caller asked for an empty queue */
+		mutex_unlock(&conn->lock);
+		return -EBUSY;
+	}
+
+	atomic_add(KDBUS_CONN_ACTIVE_BIAS, &conn->active);
+	mutex_unlock(&conn->lock);
+
+	wake_up_interruptible(&conn->wait);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	rwsem_acquire(&conn->dep_map, 0, 0, _RET_IP_);
+	if (atomic_read(&conn->active) != KDBUS_CONN_ACTIVE_BIAS)
+		lock_contended(&conn->dep_map, _RET_IP_);
+#endif
+
+	wait_event(conn->wait,	/* woken by kdbus_conn_release() */
+		   atomic_read(&conn->active) == KDBUS_CONN_ACTIVE_BIAS);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	lock_acquired(&conn->dep_map, _RET_IP_);
+	rwsem_release(&conn->dep_map, 1, _RET_IP_);
+#endif
+
+	cancel_delayed_work_sync(&conn->work);
+	kdbus_policy_remove_owner(&conn->ep->bus->policy_db, conn);
+
+	/* lock order: domain -> bus -> ep -> names -> conn */
+	mutex_lock(&conn->ep->lock);
+	down_write(&bus->conn_rwlock);
+
+	/* remove from bus and endpoint */
+	hash_del(&conn->hentry);
+	list_del(&conn->monitor_entry);
+	list_del(&conn->ep_entry);
+
+	up_write(&bus->conn_rwlock);
+	mutex_unlock(&conn->ep->lock);
+
+	/*
+	 * Remove all names associated with this connection; this possibly
+	 * moves queued messages back to the activator connection.
+	 */
+	kdbus_name_release_all(bus->name_registry, conn);
+
+	/* if we die while other connections wait for our reply, notify them */
+	mutex_lock(&conn->lock);
+	list_for_each_entry_safe(entry, tmp, &conn->queue.msg_list, entry) {
+		if (entry->reply)
+			kdbus_notify_reply_dead(bus,
+						entry->reply->reply_dst->id,
+						entry->reply->cookie);
+		kdbus_queue_entry_free(entry);
+	}
+
+	list_for_each_entry_safe(r, r_tmp, &conn->reply_list, entry)
+		kdbus_reply_unlink(r);
+	mutex_unlock(&conn->lock);
+
+	/* lock order: domain -> bus -> ep -> names -> conn */
+	down_read(&bus->conn_rwlock);
+	hash_for_each(bus->conn_hash, i, c, hentry) {
+		mutex_lock(&c->lock);
+		list_for_each_entry_safe(r, r_tmp, &c->reply_list, entry) {
+			if (r->reply_src != conn)
+				continue;
+
+			if (r->sync)
+				kdbus_sync_reply_wakeup(r, -EPIPE);
+			else
+				/* send a 'connection dead' notification */
+				kdbus_notify_reply_dead(bus, c->id, r->cookie);
+
+			kdbus_reply_unlink(r);
+		}
+		mutex_unlock(&c->lock);
+	}
+	up_read(&bus->conn_rwlock);
+
+	if (!kdbus_conn_is_monitor(conn))
+		kdbus_notify_id_change(bus, KDBUS_ITEM_ID_REMOVE,
+				       conn->id, conn->flags);
+
+	kdbus_notify_flush(bus);
+
+	return 0;
+}
+
+/**
+ * kdbus_conn_has_name() - test whether a connection holds a well-known name
+ * @conn:		Connection to inspect
+ * @name:		Well-known name to look for
+ *
+ * The registry lock of conn->ep->bus must be held by the caller.
+ *
+ * Return: true if @name is found on the connection's names list
+ */
+bool kdbus_conn_has_name(struct kdbus_conn *conn, const char *name)
+{
+	struct kdbus_name_entry *entry;
+
+	lockdep_assert_held(&conn->ep->bus->name_registry->rwlock);
+
+	list_for_each_entry(entry, &conn->names_list, conn_entry) {
+		if (!strcmp(entry->name, name))
+			return true;
+	}
+	return false;
+}
+
+struct kdbus_quota {
+	u32 memory;	/* bytes charged via kdbus_conn_quota_inc() */
+	u16 msgs;	/* messages charged, one per successful inc */
+	u8 fds;		/* file descriptors charged */
+};
+
+/**
+ * kdbus_conn_quota_inc() - account resources a user consumes on a connection
+ * @c:		connection whose quota table is updated
+ * @u:		user to charge (NULL charges the kernel pseudo-user)
+ * @memory:	number of bytes to charge
+ * @fds:	number of file descriptors to charge
+ *
+ * Quotas guard the resources of @c that other users may consume, which at
+ * present is only the receive queue of the destination.
+ *
+ * On success, the per-user counters of @u on @c grow by @memory bytes, @fds
+ * file descriptors and one message. If any limit would be exceeded, nothing
+ * is charged and an error is returned instead. Note that the per-user table
+ * may still have been enlarged in that case.
+ *
+ * Return: 0 on success, -ENOMEM if the quota table cannot be grown, -ENOBUFS
+ * if the memory or message limit is hit, -EMFILE if the FD limit is hit.
+ */
+int kdbus_conn_quota_inc(struct kdbus_conn *c, struct kdbus_user *u,
+			 size_t memory, size_t fds)
+{
+	unsigned int idx = u ? u->id : KDBUS_USER_KERNEL_ID;
+	size_t avail, used;
+	struct kdbus_quota *q;
+
+	/*
+	 * Pool layout:
+	 * One half of a pool always belongs to the connection owning it. That
+	 * half serves kernel queries, the handling of received messages and
+	 * any other task that is under the control of the pool owner.
+	 * The remaining half is used as the incoming queue.
+	 * User-space based policies are optional, so the incoming half has to
+	 * be handed out fairly. At the same time only a minimum should stay
+	 * reserved, so that the pool is utilized as well as possible. And as
+	 * the number of users talking to a connection cannot be known ahead
+	 * of time, the scheme must adapt to it dynamically. The allocation
+	 * therefore works as follows:
+	 * The bytes a sending user may hold in a destination's pool are
+	 * limited to 33% of the unused pool space, where space already held
+	 * by that very user counts as 'unused', too. Users coming first are
+	 * favored this way, yet enough pool space remains for any users that
+	 * follow. Since messages are dequeued in FIFO order, this is expected
+	 * to balance nicely when the number of users grows. At the same time,
+	 * the algorithm guarantees that the space available to a connection
+	 * is reduced dynamically, the more concurrent users talk to a
+	 * connection.
+	 */
+
+	/* state is kept per user, so every slot has to stay small */
+	BUILD_BUG_ON(sizeof(q->memory) != 4);
+	BUILD_BUG_ON(sizeof(q->msgs) != 2);
+	BUILD_BUG_ON(sizeof(q->fds) != 1);
+	BUILD_BUG_ON(KDBUS_CONN_MAX_MSGS > U16_MAX);
+	BUILD_BUG_ON(KDBUS_CONN_MAX_FDS_PER_USER > U8_MAX);
+
+	/* enlarge the per-user table if @idx has no slot yet */
+	if (idx >= c->n_quota) {
+		unsigned int slots = max(KDBUS_ALIGN8(idx) + 8, idx);
+
+		q = krealloc(c->quota, slots * sizeof(*q),
+			     GFP_KERNEL | __GFP_ZERO);
+		if (!q)
+			return -ENOMEM;
+
+		c->quota = q;
+		c->n_quota = slots;
+	}
+
+	q = &c->quota[idx];
+	kdbus_pool_accounted(c->pool, &avail, &used);
+
+	/* one half of the pool is reserved for its owner at all times */
+	avail /= 2;
+
+	/*
+	 * Slices of the pool owner are un-accounted and can claim more than
+	 * 50% of the queue. The slices handled here belong to the incoming
+	 * queue, though; they are 'accounted' slices, and for those the
+	 * 50%-limit is enforced.
+	 */
+	if (used > avail)
+		return -ENOBUFS;
+
+	/* a third of what is left, counting this user's own share as free */
+	avail = (avail - used + q->memory) / 3;
+
+	if (avail < q->memory ||
+	    avail - q->memory < memory ||
+	    q->memory + memory > U32_MAX)
+		return -ENOBUFS;
+	if (q->msgs >= KDBUS_CONN_MAX_MSGS)
+		return -ENOBUFS;
+	if (q->fds + fds < q->fds ||
+	    q->fds + fds > KDBUS_CONN_MAX_FDS_PER_USER)
+		return -EMFILE;
+
+	/* all limits hold: charge the user */
+	q->memory += memory;
+	q->fds += fds;
+	++q->msgs;
+	return 0;
+}
+
+/**
+ * kdbus_conn_quota_dec() - undo a previous quota charge
+ * @c:		connection whose quota table is updated
+ * @u:		user that was charged (NULL for the kernel pseudo-user)
+ * @memory:	number of bytes that were charged
+ * @fds:	number of file descriptors that were charged
+ *
+ * Counterpart of kdbus_conn_quota_inc(). Everything charged via a successful
+ * kdbus_conn_quota_inc() has to be given back through this call. It must not
+ * be called for a kdbus_conn_quota_inc() that failed, as nothing was charged
+ * in that case.
+ */
+void kdbus_conn_quota_dec(struct kdbus_conn *c, struct kdbus_user *u,
+			  size_t memory, size_t fds)
+{
+	unsigned int idx = u ? u->id : KDBUS_USER_KERNEL_ID;
+	struct kdbus_quota *q;
+
+	/* no slot for this user: nothing can have been charged */
+	if (WARN_ON(idx >= c->n_quota))
+		return;
+
+	q = &c->quota[idx];
+	/* warn about, but never perform, a decrement that would underflow */
+	if (!WARN_ON(!q->msgs))
+		q->msgs--;
+	if (!WARN_ON(memory > q->memory))
+		q->memory -= memory;
+	if (!WARN_ON(fds > q->fds))
+		q->fds -= fds;
+}
+
+/**
+ * kdbus_conn_lost_message() - record that a connection lost a message
+ * @c:		connection that lost a message
+ *
+ * kdbus is reliable, i.e. it tries hard to never lose a message. Memory is
+ * limited, though, so transmissions can fail. Quota limits report such a
+ * failure to the sender of a unicast, but the sender of a broadcast cannot be
+ * made to handle it. Instead, the destination has to learn that a broadcast
+ * did not reach it.
+ * This is unlikely, so it is kept simple: one counter remembers how many
+ * messages were lost since the last call to RECV. The next message retrieval
+ * tells the connection about the loss so it can resync its state. Waiters on
+ * the connection are only woken when the counter goes from 0 to 1.
+ */
+void kdbus_conn_lost_message(struct kdbus_conn *c)
+{
+	int lost = atomic_inc_return(&c->lost_count);
+
+	if (lost == 1)
+		wake_up_interruptible(&c->wait);
+}
+
+/* Build a queue entry for @conn_dst; callers should hold the conn_dst lock */
+static struct kdbus_queue_entry *
+kdbus_conn_entry_make(struct kdbus_conn *conn_src,
+		      struct kdbus_conn *conn_dst,
+		      struct kdbus_staging *staging)
+{
+	/* a disconnected remote connection cannot take messages */
+	if (!kdbus_conn_active(conn_dst))
+		return ERR_PTR(-ECONNRESET);
+
+	/*
+	 * Messages with file descriptors attached are refused unless the
+	 * receiver accepts file descriptors.
+	 *
+	 * Monitor connections accept such messages regardless; for them,
+	 * all file descriptors will be set to -1 at receive time.
+	 */
+	if (kdbus_conn_is_monitor(conn_dst) ||
+	    (conn_dst->flags & KDBUS_HELLO_ACCEPT_FD) ||
+	    !staging->gaps || !(staging->gaps->n_fds > 0))
+		return kdbus_queue_entry_new(conn_src, conn_dst, staging);
+
+	return ERR_PTR(-ECOMM);
+}
+
+/*
+ * Synchronous reply path: allocate a queue entry and attach it to the reply
+ * tracking object instead of the connection's queue, then wake the waiter.
+ * Returns 0 or a negative errno; the waiter sees -EREMOTEIO for -ECOMM.
+ */
+static int kdbus_conn_entry_sync_attach(struct kdbus_conn *conn_dst,
+					struct kdbus_staging *staging,
+					struct kdbus_reply *reply_wake)
+{
+	struct kdbus_queue_entry *entry;
+	int remote_ret, ret = 0;
+
+	mutex_lock(&reply_wake->reply_dst->lock);
+
+	/*
+	 * If we are still waiting then proceed, allocate a queue
+	 * entry and attach it to the reply object
+	 */
+	if (reply_wake->waiting) {
+		entry = kdbus_conn_entry_make(reply_wake->reply_src, conn_dst,
+					      staging);
+		if (IS_ERR(entry))
+			ret = PTR_ERR(entry);
+		else
+			/* Attach the entry to the reply object */
+			reply_wake->queue_entry = entry;
+	} else {
+		ret = -ECONNRESET;
+	}
+
+	/*
+	 * Update the reply object and wake up the remote peer. The peer is
+	 * woken on every return code; only -ECOMM is reported differently:
+	 *
+	 * * -ECOMM: if the replying connection failed with -ECOMM
+	 *           then wakeup remote peer with -EREMOTEIO
+	 *
+	 *           We do this to differentiate between -ECOMM errors
+	 *           from the original sender perspective:
+	 *           -ECOMM error during the sync send and
+	 *           -ECOMM error during the sync reply, this last
+	 *           one is rewritten to -EREMOTEIO
+	 *
+	 * * All other return codes are passed on unchanged.
+	 */
+	remote_ret = ret;
+
+	if (ret == -ECOMM)
+		remote_ret = -EREMOTEIO;
+
+	kdbus_sync_reply_wakeup(reply_wake, remote_ret);
+	kdbus_reply_unlink(reply_wake);
+	mutex_unlock(&reply_wake->reply_dst->lock);
+
+	return ret;
+}
+
+/**
+ * kdbus_conn_entry_insert() - enqueue a message into the receiver's pool
+ * @conn_src:		The sending connection
+ * @conn_dst:		The connection to queue into
+ * @staging:		Message to send
+ * @reply:		The reply tracker to attach to the queue entry, or NULL
+ * @name:		Destination name this msg is sent to, or NULL
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int kdbus_conn_entry_insert(struct kdbus_conn *conn_src,
+			    struct kdbus_conn *conn_dst,
+			    struct kdbus_staging *staging,
+			    struct kdbus_reply *reply,
+			    const struct kdbus_name_entry *name)
+{
+	struct kdbus_queue_entry *entry;
+	int ret;
+
+	kdbus_conn_lock2(conn_src, conn_dst);
+
+	entry = kdbus_conn_entry_make(conn_src, conn_dst, staging);
+	if (IS_ERR(entry)) {
+		ret = PTR_ERR(entry);
+		goto exit_unlock;
+	}
+
+	if (reply) {
+		kdbus_reply_link(reply);
+		if (!reply->sync)	/* only for asynchronous replies */
+			schedule_delayed_work(&conn_src->work, 0);
+	}
+
+	/*
+	 * Record the sequence number of the registered name; it will
+	 * be remembered by the queue, in case messages addressed to a
+	 * name need to be moved from or to an activator.
+	 */
+	if (name)
+		entry->dst_name_id = name->name_id;
+
+	kdbus_queue_entry_enqueue(entry, reply);
+	wake_up_interruptible(&conn_dst->wait);
+
+	ret = 0;
+
+exit_unlock:
+	kdbus_conn_unlock2(conn_src, conn_dst);
+	return ret;
+}
+
+/*
+ * Block a synchronous SEND of @conn_src until @reply_wait is answered, the
+ * connection dies, or the wait is cancelled, interrupted or timed out.
+ */
+static int kdbus_conn_wait_reply(struct kdbus_conn *conn_src,
+				 struct kdbus_cmd_send *cmd_send,
+				 struct file *ioctl_file,
+				 struct file *cancel_fd,
+				 struct kdbus_reply *reply_wait,
+				 ktime_t expire)
+{
+	struct kdbus_queue_entry *entry;
+	struct poll_wqueues pwq = {};
+	int ret;
+
+	if (WARN_ON(!reply_wait))
+		return -EIO;
+
+	/*
+	 * Block until the reply arrives. Timeout scans conducted for other,
+	 * asynchronous replies of conn_src leave reply_wait untouched.
+	 */
+
+	poll_initwait(&pwq);
+	poll_wait(ioctl_file, &conn_src->wait, &pwq.pt);
+
+	for (;;) {
+		/*
+		 * Any of the following conditions will stop our synchronously
+		 * blocking SEND command:
+		 *
+		 * a) The origin sender closed its connection
+		 * b) The remote peer answered, setting reply_wait->waiting = 0
+		 * c) The cancel FD was written to
+		 * d) A signal was received
+		 * e) The specified timeout was reached, and none of the above
+		 *    conditions kicked in.
+		 */
+
+		/*
+		 * We have already acquired an active reference when
+		 * entering here, but another thread may call
+		 * KDBUS_CMD_BYEBYE which does not acquire an active
+		 * reference, therefore kdbus_conn_disconnect() will
+		 * not wait for us.
+		 */
+		if (!kdbus_conn_active(conn_src)) {
+			ret = -ECONNRESET;
+			break;
+		}
+
+		/* the replying peer unsets ->waiting, then wakes us up */
+		if (!reply_wait->waiting) {
+			ret = reply_wait->err;
+			break;
+		}
+
+		if (cancel_fd) {
+			unsigned int r;
+
+			r = cancel_fd->f_op->poll(cancel_fd, &pwq.pt);
+			if (r & POLLIN) {
+				ret = -ECANCELED;
+				break;
+			}
+		}
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		if (!poll_schedule_timeout(&pwq, TASK_INTERRUPTIBLE,
+					   &expire, 0)) {
+			ret = -ETIMEDOUT;
+			break;
+		}
+
+		/*
+		 * Reset the poll worker func, so the waitqueues are not
+		 * added to the poll table again. We just reuse what we've
+		 * collected earlier for further iterations.
+		 */
+		init_poll_funcptr(&pwq.pt, NULL);
+	}
+
+	poll_freewait(&pwq);
+
+	if (ret == -EINTR) {
+		/*
+		 * Interrupted system call. Unref the reply object, and pass
+		 * the return value down the chain. Mark the reply as
+		 * interrupted, so the cleanup work can remove it, but do not
+		 * unlink it from the list. Once the syscall restarts, we'll
+		 * pick it up and wait on it again.
+		 */
+		mutex_lock(&conn_src->lock);
+		reply_wait->interrupted = true;
+		schedule_delayed_work(&conn_src->work, 0);
+		mutex_unlock(&conn_src->lock);
+
+		return -ERESTARTSYS;
+	}
+
+	mutex_lock(&conn_src->lock);
+	reply_wait->waiting = false;
+	entry = reply_wait->queue_entry;
+	if (entry) {	/* a reply was attached: hand it to the caller */
+		ret = kdbus_queue_entry_install(entry,
+						&cmd_send->reply.return_flags,
+						true);
+		kdbus_pool_slice_publish(entry->slice, &cmd_send->reply.offset,
+					 &cmd_send->reply.msg_size);
+		kdbus_queue_entry_free(entry);
+	}
+	kdbus_reply_unlink(reply_wait);
+	mutex_unlock(&conn_src->lock);
+
+	return ret;
+}
+
+/*
+ * Resolve and pin the destination of @staging by ID or, if set, by name.
+ * On success, *@out_dst holds a reference for the caller to drop, *@out_name
+ * the name entry or NULL. Needs the name-registry lock; returns 0 or -errno.
+ */
+static int kdbus_pin_dst(struct kdbus_bus *bus,
+			 struct kdbus_staging *staging,
+			 struct kdbus_name_entry **out_name,
+			 struct kdbus_conn **out_dst)
+{
+	const struct kdbus_msg *msg = staging->msg;
+	struct kdbus_name_entry *name = NULL;
+	struct kdbus_conn *dst = NULL;
+	int ret;
+
+	lockdep_assert_held(&bus->name_registry->rwlock);
+
+	if (!staging->dst_name) {
+		dst = kdbus_bus_find_conn_by_id(bus, msg->dst_id);
+		if (!dst)
+			return -ENXIO;
+
+		if (!kdbus_conn_is_ordinary(dst)) {
+			ret = -ENXIO;
+			goto error;
+		}
+	} else {
+		name = kdbus_name_lookup_unlocked(bus->name_registry,
+						  staging->dst_name);
+		if (!name)
+			return -ESRCH;
+
+		/*
+		 * If an ID is given in addition to the name, it must match
+		 * the connection currently owning the name. name->conn may
+		 * be NULL (the activator is used below); no ID matches then.
+		 */
+		if (msg->dst_id != KDBUS_DST_ID_NAME &&
+		    (!name->conn || msg->dst_id != name->conn->id))
+			return -EREMCHG;
+
+		dst = kdbus_conn_ref(!name->conn && name->activator ?
+				     name->activator : name->conn);
+
+		if ((msg->flags & KDBUS_MSG_NO_AUTO_START) &&
+		    kdbus_conn_is_activator(dst)) {
+			ret = -EADDRNOTAVAIL;
+			goto error;
+		}
+	}
+
+	*out_name = name;
+	*out_dst = dst;
+	return 0;
+
+error:
+	kdbus_conn_unref(dst);
+	return ret;
+}
+
+/*
+ * Deliver a reply from @src: the destination must have a pending reply
+ * tracker for msg->cookie_reply, otherwise -EPERM is returned.
+ */
+static int kdbus_conn_reply(struct kdbus_conn *src,
+			    struct kdbus_staging *staging)
+{
+	const struct kdbus_msg *msg = staging->msg;
+	struct kdbus_bus *bus = src->ep->bus;
+	struct kdbus_reply *pending, *sync_wait = NULL;
+	struct kdbus_name_entry *name = NULL;
+	struct kdbus_conn *dst = NULL;
+	int ret;
+
+	if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) ||
+	    WARN_ON(msg->flags & KDBUS_MSG_EXPECT_REPLY) ||
+	    WARN_ON(msg->flags & KDBUS_MSG_SIGNAL))
+		return -EINVAL;
+
+	/* hold the name-registry for the lookup *and* while collecting data */
+	down_read(&bus->name_registry->rwlock);
+
+	ret = kdbus_pin_dst(bus, staging, &name, &dst);
+	if (ret < 0)
+		goto out_unlock;
+
+	mutex_lock(&dst->lock);
+	pending = kdbus_reply_find(src, dst, msg->cookie_reply);
+	if (pending && pending->sync)
+		sync_wait = kdbus_reply_ref(pending);
+	if (pending)
+		kdbus_reply_unlink(pending);
+	mutex_unlock(&dst->lock);
+
+	if (!pending) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	kdbus_bus_eavesdrop(bus, src, staging);
+
+	/* sync replies are attached to the waiter, all others are queued */
+	if (sync_wait)
+		ret = kdbus_conn_entry_sync_attach(dst, staging, sync_wait);
+	else
+		ret = kdbus_conn_entry_insert(src, dst, staging, NULL, name);
+
+out_unlock:
+	up_read(&bus->name_registry->rwlock);
+	kdbus_reply_unref(sync_wait);
+	kdbus_conn_unref(dst);
+	return ret;
+}
+
+/*
+ * Start, or resume after an interruption, a synchronous call from @src.
+ * Returns the reply tracker to wait on (with a reference) or an ERR_PTR.
+ */
+static struct kdbus_reply *kdbus_conn_call(struct kdbus_conn *src,
+					   struct kdbus_staging *staging,
+					   ktime_t exp)
+{
+	const struct kdbus_msg *msg = staging->msg;
+	struct kdbus_bus *bus = src->ep->bus;
+	struct kdbus_name_entry *name = NULL;
+	struct kdbus_conn *dst = NULL;
+	struct kdbus_reply *tracker;
+	int ret;
+
+	if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) ||
+	    WARN_ON(msg->flags & KDBUS_MSG_SIGNAL) ||
+	    WARN_ON(!(msg->flags & KDBUS_MSG_EXPECT_REPLY)))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * A tracker left behind by an interrupted call with the same cookie
+	 * is picked up again instead of sending the message a second time.
+	 */
+	mutex_lock(&src->lock);
+	tracker = kdbus_reply_find(NULL, src, msg->cookie);
+	if (tracker && tracker->interrupted) {
+		kdbus_reply_ref(tracker);
+		tracker->interrupted = false;
+	} else {
+		tracker = NULL;
+	}
+	mutex_unlock(&src->lock);
+
+	if (tracker)
+		return tracker;
+
+	/* a deadline that has passed already fails the call right away */
+	if (ktime_compare(ktime_get(), exp) >= 0)
+		return ERR_PTR(-ETIMEDOUT);
+
+	/* hold the name-registry for the lookup *and* while collecting data */
+	down_read(&bus->name_registry->rwlock);
+
+	ret = kdbus_pin_dst(bus, staging, &name, &dst);
+	if (ret < 0)
+		goto out_unlock;
+
+	/* the caller must be allowed to talk to the destination */
+	if (!kdbus_conn_policy_talk(src, current_cred(), dst)) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	tracker = kdbus_reply_new(dst, src, msg, name, true);
+	if (IS_ERR(tracker)) {
+		ret = PTR_ERR(tracker);
+		tracker = NULL;
+		goto out_unlock;
+	}
+
+	/* send message */
+	kdbus_bus_eavesdrop(bus, src, staging);
+
+	ret = kdbus_conn_entry_insert(src, dst, staging, tracker, name);
+
+out_unlock:
+	up_read(&bus->name_registry->rwlock);
+	/* on failure, drop the tracker and hand out the error instead */
+	if (ret < 0) {
+		kdbus_reply_unref(tracker);
+		tracker = ERR_PTR(ret);
+	}
+	kdbus_conn_unref(dst);
+	return tracker;
+}
+
+/*
+ * Send a unicast message or signal from @src. Signals are handled like
+ * broadcasts: delivery problems on the receiver side are not reported.
+ */
+static int kdbus_conn_unicast(struct kdbus_conn *src,
+			      struct kdbus_staging *staging)
+{
+	const struct kdbus_msg *msg = staging->msg;
+	const bool is_signal = (msg->flags & KDBUS_MSG_SIGNAL);
+	struct kdbus_bus *bus = src->ep->bus;
+	struct kdbus_name_entry *name = NULL;
+	struct kdbus_reply *tracker = NULL;
+	struct kdbus_conn *dst = NULL;
+	int ret;
+
+	if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) ||
+	    WARN_ON(!(msg->flags & KDBUS_MSG_EXPECT_REPLY) &&
+		    msg->cookie_reply != 0))
+		return -EINVAL;
+
+	/* hold the name-registry for the lookup *and* while collecting data */
+	down_read(&bus->name_registry->rwlock);
+
+	ret = kdbus_pin_dst(bus, staging, &name, &dst);
+	if (ret < 0)
+		goto out_unlock;
+
+	if (is_signal) {
+		/* like broadcasts we eavesdrop even if the msg is dropped */
+		kdbus_bus_eavesdrop(bus, src, staging);
+
+		/* peer not interested or not privileged: drop silently */
+		if (!kdbus_match_db_match_msg(dst->match_db, src, staging) ||
+		    !kdbus_conn_policy_talk(dst, NULL, src))
+			goto out_unlock;
+	} else {
+		if (!kdbus_conn_policy_talk(src, current_cred(), dst)) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+
+		if (msg->flags & KDBUS_MSG_EXPECT_REPLY) {
+			tracker = kdbus_reply_new(dst, src, msg, name, false);
+			if (IS_ERR(tracker)) {
+				ret = PTR_ERR(tracker);
+				tracker = NULL;
+				goto out_unlock;
+			}
+		}
+
+		kdbus_bus_eavesdrop(bus, src, staging);
+	}
+
+	ret = kdbus_conn_entry_insert(src, dst, staging, tracker, name);
+	if (is_signal || ret >= 0)
+		ret = 0;
+
+out_unlock:
+	up_read(&bus->name_registry->rwlock);
+	kdbus_reply_unref(tracker);
+	kdbus_conn_unref(dst);
+	return ret;
+}
+
+/**
+ * kdbus_conn_move_messages() - move messages from one connection to another
+ * @conn_dst:		Connection to move to
+ * @conn_src:		Connection to move from
+ * @name_id:		Sequence number of the registered name to filter
+ *			for, 0 means no filtering.
+ *
+ * Moves the queued messages of @conn_src over to @conn_dst and re-points the
+ * reply trackers whose reply_src is @conn_src. This is used when an
+ * implementer connection takes over a well-known name from an activator
+ * connection, or gives it back. Messages that cannot be moved are dropped
+ * and counted as lost on @conn_dst.
+ */
+void kdbus_conn_move_messages(struct kdbus_conn *conn_dst,
+			      struct kdbus_conn *conn_src,
+			      u64 name_id)
+{
+	struct kdbus_queue_entry *qe, *qe_next;
+	struct kdbus_reply *reply, *reply_next;
+	struct kdbus_conn *peer;
+	struct kdbus_bus *bus;
+	int bkt;
+
+	if (WARN_ON(conn_src == conn_dst))
+		return;
+
+	bus = conn_src->ep->bus;
+
+	/* lock order: domain -> bus -> ep -> names -> conn */
+	down_read(&bus->conn_rwlock);
+	hash_for_each(bus->conn_hash, bkt, peer, hentry) {
+		if (peer == conn_src || peer == conn_dst)
+			continue;
+
+		/* redirect replies the peer expects from @conn_src */
+		mutex_lock(&peer->lock);
+		list_for_each_entry_safe(reply, reply_next,
+					 &peer->reply_list, entry) {
+			/* skip other repliers and, if filtering, other names */
+			if (reply->reply_src != conn_src ||
+			    (name_id > 0 && reply->name_id != name_id))
+				continue;
+
+			kdbus_conn_unref(reply->reply_src);
+			reply->reply_src = kdbus_conn_ref(conn_dst);
+		}
+		mutex_unlock(&peer->lock);
+	}
+	up_read(&bus->conn_rwlock);
+
+	/* now move the queued messages themselves */
+	kdbus_conn_lock2(conn_src, conn_dst);
+	list_for_each_entry_safe(qe, qe_next,
+				 &conn_src->queue.msg_list, entry) {
+		/* filter messages for a specific name */
+		if (name_id > 0 && qe->dst_name_id != name_id)
+			continue;
+
+		/*
+		 * A message is lost for @conn_dst if it carries FDs that
+		 * @conn_dst does not accept, or if moving it fails.
+		 */
+		if ((!(conn_dst->flags & KDBUS_HELLO_ACCEPT_FD) &&
+		     qe->gaps && qe->gaps->n_fds > 0) ||
+		    kdbus_queue_entry_move(qe, conn_dst) < 0) {
+			kdbus_conn_lost_message(conn_dst);
+			kdbus_queue_entry_free(qe);
+		}
+	}
+	kdbus_conn_unlock2(conn_src, conn_dst);
+
+	/* wake up poll() */
+	wake_up_interruptible(&conn_dst->wait);
+}
+
+/* true if any name owned by @whom yields at least @access in @db */
+static bool kdbus_conn_policy_query_all(struct kdbus_conn *conn,
+					const struct cred *conn_creds,
+					struct kdbus_policy_db *db,
+					struct kdbus_conn *whom,
+					unsigned int access)
+{
+	const struct cred *creds = conn_creds ? : conn->cred;
+	struct kdbus_name_entry *entry;
+	bool allowed = false;
+
+	lockdep_assert_held(&conn->ep->bus->name_registry->rwlock);
+
+	down_read(&db->entries_rwlock);
+	mutex_lock(&whom->lock);
+
+	list_for_each_entry(entry, &whom->names_list, conn_entry) {
+		int level = kdbus_policy_query_unlocked(db, creds, entry->name,
+						kdbus_strhash(entry->name));
+
+		if (level >= (int)access) {
+			allowed = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&whom->lock);
+	up_read(&db->entries_rwlock);
+
+	return allowed;
+}
+
+/**
+ * kdbus_conn_policy_own_name() - check whether a connection may own a name
+ * @conn:		Connection
+ * @conn_creds:		Credentials of @conn to use for the policy check,
+ *			or NULL to use the credentials stored in @conn
+ * @name:		Name
+ *
+ * Checks whether @conn is permitted to acquire the well-known name @name.
+ *
+ * Return: true if allowed, false if not.
+ */
+bool kdbus_conn_policy_own_name(struct kdbus_conn *conn,
+				const struct cred *conn_creds,
+				const char *name)
+{
+	const struct cred *creds = conn_creds ? : conn->cred;
+	unsigned int hash = kdbus_strhash(name);
+	struct kdbus_ep *ep = conn->ep;
+	int res;
+
+	/* with ep->user set, the endpoint policy is asked first and may veto */
+	if (ep->user) {
+		res = kdbus_policy_query(&ep->policy_db, creds, name, hash);
+		if (res < KDBUS_POLICY_OWN)
+			return false;
+	}
+
+	/* privileged connections are not subject to the bus policy */
+	if (conn->privileged)
+		return true;
+
+	res = kdbus_policy_query(&ep->bus->policy_db, creds, name, hash);
+	return res >= KDBUS_POLICY_OWN;
+}
+
+/**
+ * kdbus_conn_policy_talk() - check whether a connection may talk to a peer
+ * @conn:		Connection that tries to talk
+ * @conn_creds:		Credentials of @conn to use for the policy check,
+ *			or NULL to use the credentials stored in @conn
+ * @to:			Connection that is talked to
+ *
+ * Checks whether @conn is permitted to talk to @to.
+ *
+ * Return: true if allowed, false if not.
+ */
+bool kdbus_conn_policy_talk(struct kdbus_conn *conn,
+			    const struct cred *conn_creds,
+			    struct kdbus_conn *to)
+{
+	const struct cred *creds = conn_creds ? : conn->cred;
+	struct kdbus_ep *ep = conn->ep;
+
+	/* with ep->user set, the endpoint policy binds privileged conns, too */
+	if (ep->user &&
+	    !kdbus_conn_policy_query_all(conn, creds, &ep->policy_db, to,
+					 KDBUS_POLICY_TALK))
+		return false;
+
+	/* privileged senders and peers of the same user skip the bus policy */
+	if (conn->privileged || uid_eq(creds->euid, to->cred->uid))
+		return true;
+
+	return kdbus_conn_policy_query_all(conn, creds, &ep->bus->policy_db,
+					   to, KDBUS_POLICY_TALK);
+}
+
+/**
+ * kdbus_conn_policy_see_name_unlocked() - check whether a connection may see
+ *					   a given name
+ * @conn:		Connection
+ * @conn_creds:		Credentials of @conn to use for policy check
+ * @name:		Name
+ *
+ * Checks whether the well-known name @name is visible to @conn. The caller
+ * must hold the policy-lock.
+ *
+ * Return: true if allowed, false if not.
+ */
+bool kdbus_conn_policy_see_name_unlocked(struct kdbus_conn *conn,
+					 const struct cred *conn_creds,
+					 const char *name)
+{
+	const struct cred *creds = conn_creds ? : conn->cred;
+	int level;
+
+	/*
+	 * Only custom endpoints can carry SEE policies, and on those no name
+	 * is visible by default. On a bus, every name is visible by default.
+	 */
+	if (!conn->ep->user)
+		return true;
+
+	level = kdbus_policy_query_unlocked(&conn->ep->policy_db, creds, name,
+					    kdbus_strhash(name));
+	return level >= KDBUS_POLICY_SEE;
+}
+
+static bool kdbus_conn_policy_see_name(struct kdbus_conn *conn,
+				       const struct cred *conn_creds,
+				       const char *name)
+{
+	bool visible;
+
+	/* same as the _unlocked variant, but takes the policy lock itself */
+	down_read(&conn->ep->policy_db.entries_rwlock);
+	visible = kdbus_conn_policy_see_name_unlocked(conn, conn_creds, name);
+	up_read(&conn->ep->policy_db.entries_rwlock);
+	return visible;
+}
+
+static bool kdbus_conn_policy_see(struct kdbus_conn *conn,
+				  const struct cred *conn_creds,
+				  struct kdbus_conn *whom)
+{
+	/*
+	 * On a bus, all names and hence all connections are visible. Custom
+	 * endpoints hide everything by default, and peers are hidden from
+	 * each other unless at least _one_ name of the peer can be seen.
+	 */
+	if (!conn->ep->user)
+		return true;
+
+	return kdbus_conn_policy_query_all(conn, conn_creds,
+					   &conn->ep->policy_db, whom,
+					   KDBUS_POLICY_SEE);
+}
+
+/**
+ * kdbus_conn_policy_see_notification() - check whether a connection may
+ *					  receive a kernel notification
+ * @conn:		Connection
+ * @conn_creds:		Credentials of @conn to use for policy check
+ * @msg:		Notification message
+ *
+ * Decides whether the kernel notification @msg may be delivered to @conn.
+ *
+ * Return: true if allowed, false if not.
+ */
+bool kdbus_conn_policy_see_notification(struct kdbus_conn *conn,
+					const struct cred *conn_creds,
+					const struct kdbus_msg *msg)
+{
+	/*
+	 * Kernel notifications are broadcast, so they have to be filtered
+	 * per receiver according to the type of their first item:
+	 *  - name notifications are subject to the SEE policy of @conn
+	 *  - ID notifications are never filtered
+	 * Any other item type is not expected here and triggers a warning.
+	 */
+
+	switch (msg->items[0].type) {
+	case KDBUS_ITEM_ID_ADD:
+	case KDBUS_ITEM_ID_REMOVE:
+		/* ID changes go to everyone, so that peers can be tracked */
+		return true;
+
+	case KDBUS_ITEM_NAME_ADD:
+	case KDBUS_ITEM_NAME_REMOVE:
+	case KDBUS_ITEM_NAME_CHANGE:
+		/* name changes go only to peers that can see the name */
+		return kdbus_conn_policy_see_name(conn, conn_creds,
+					msg->items[0].name_change.name);
+
+	default:
+		break;
+	}
+
+	WARN(1, "Invalid type for notification broadcast: %llu\n",
+	     (unsigned long long)msg->items[0].type);
+	return false;
+}
+
+/**
+ * kdbus_cmd_hello() - handle KDBUS_CMD_HELLO
+ * @ep:			Endpoint to operate on
+ * @privileged:		Whether the caller is privileged
+ * @argp:		Command payload
+ *
+ * Return: NULL or newly created connection on success, ERR_PTR on failure.
+ */
+struct kdbus_conn *kdbus_cmd_hello(struct kdbus_ep *ep, bool privileged,
+				   void __user *argp)
+{
+	struct kdbus_cmd_hello *cmd;
+	struct kdbus_conn *c = NULL;
+	const char *item_name;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },	/* argv[0] */
+		{ .type = KDBUS_ITEM_NAME },		/* argv[1] */
+		{ .type = KDBUS_ITEM_CREDS },		/* argv[2] */
+		{ .type = KDBUS_ITEM_PIDS },		/* argv[3] */
+		{ .type = KDBUS_ITEM_SECLABEL },	/* argv[4] */
+		{ .type = KDBUS_ITEM_CONN_DESCRIPTION },	/* argv[5] */
+		{ .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_HELLO_ACCEPT_FD |
+				 KDBUS_HELLO_ACTIVATOR |
+				 KDBUS_HELLO_POLICY_HOLDER |
+				 KDBUS_HELLO_MONITOR,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)	/* parser returned > 0: succeed without a connection */
+		return NULL;
+
+	item_name = argv[1].item ? argv[1].item->str : NULL;
+
+	c = kdbus_conn_new(ep, privileged, cmd, item_name,
+			   argv[2].item ? &argv[2].item->creds : NULL,
+			   argv[3].item ? &argv[3].item->pids : NULL,
+			   argv[4].item ? argv[4].item->str : NULL,
+			   argv[5].item ? argv[5].item->str : NULL);
+	if (IS_ERR(c)) {
+		ret = PTR_ERR(c);
+		c = NULL;	/* nothing to tear down at exit */
+		goto exit;
+	}
+
+	ret = kdbus_conn_connect(c, item_name);
+	if (ret < 0)
+		goto exit;
+
+	if (kdbus_conn_is_activator(c) || kdbus_conn_is_policy_holder(c)) {
+		ret = kdbus_conn_acquire(c);
+		if (ret < 0)
+			goto exit;
+
+		ret = kdbus_policy_set(&c->ep->bus->policy_db, args.items,
+				       args.items_size, 1,
+				       kdbus_conn_is_policy_holder(c), c);
+		kdbus_conn_release(c);
+		if (ret < 0)
+			goto exit;
+	}
+
+	if (copy_to_user(argp, cmd, sizeof(*cmd)))
+		ret = -EFAULT;
+
+exit:
+	ret = kdbus_args_clear(&args, ret);
+	if (ret < 0) {
+		if (c) {
+			/* returns early if @c is not, or no longer, connected */
+			kdbus_conn_disconnect(c, false);
+			kdbus_conn_unref(c);
+		}
+		return ERR_PTR(ret);
+	}
+	return c;
+}
+
+/**
+ * kdbus_cmd_byebye_unlocked() - handle KDBUS_CMD_BYEBYE
+ * @conn:		ordinary connection to disconnect (-EOPNOTSUPP otherwise)
+ * @argp:		command payload
+ *
+ * The caller must not hold any active reference to @conn or this will deadlock.
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_byebye_unlocked(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_cmd *cmd;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	if (!kdbus_conn_is_ordinary(conn))
+		return -EOPNOTSUPP;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	ret = kdbus_conn_disconnect(conn, true);
+	return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_conn_info() - handle KDBUS_CMD_CONN_INFO
+ * @conn:		caller's connection; the reply goes into its pool
+ * @argp:		command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_conn_info(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_meta_conn *conn_meta = NULL;
+	struct kdbus_pool_slice *slice = NULL;
+	struct kdbus_name_entry *entry = NULL;
+	struct kdbus_conn *owner_conn = NULL;
+	struct kdbus_item *meta_items = NULL;
+	struct kdbus_info info = {};
+	struct kdbus_cmd_info *cmd;
+	struct kdbus_bus *bus = conn->ep->bus;
+	struct kvec kvec[3];
+	size_t meta_size, cnt = 0;
+	const char *name;
+	u64 attach_flags, size = 0;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_NAME },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	/* registry must be held throughout lookup *and* collecting data */
+	down_read(&bus->name_registry->rwlock);
+
+	ret = kdbus_sanitize_attach_flags(cmd->attach_flags, &attach_flags);
+	if (ret < 0)
+		goto exit;
+
+	name = argv[1].item ? argv[1].item->str : NULL;
+
+	if (name) {
+		entry = kdbus_name_lookup_unlocked(bus->name_registry, name);
+		if (!entry || !entry->conn ||
+		    !kdbus_conn_policy_see_name(conn, current_cred(), name) ||
+		    (cmd->id != 0 && entry->conn->id != cmd->id)) {
+			/* pretend a name doesn't exist if you cannot see it */
+			ret = -ESRCH;
+			goto exit;
+		}
+
+		owner_conn = kdbus_conn_ref(entry->conn);
+	} else if (cmd->id > 0) {
+		owner_conn = kdbus_bus_find_conn_by_id(bus, cmd->id);
+		if (!owner_conn || !kdbus_conn_policy_see(conn, current_cred(),
+							  owner_conn)) {
+			/* pretend an id doesn't exist if you cannot see it */
+			ret = -ENXIO;
+			goto exit;
+		}
+	} else {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	attach_flags &= atomic64_read(&owner_conn->attach_flags_send);
+
+	conn_meta = kdbus_meta_conn_new();
+	if (IS_ERR(conn_meta)) {
+		ret = PTR_ERR(conn_meta);
+		conn_meta = NULL;
+		goto exit;
+	}
+
+	ret = kdbus_meta_conn_collect(conn_meta, owner_conn, 0, attach_flags);
+	if (ret < 0)
+		goto exit;
+
+	ret = kdbus_meta_emit(owner_conn->meta_proc, owner_conn->meta_fake,
+			      conn_meta, conn, attach_flags,
+			      &meta_items, &meta_size);
+	if (ret < 0)
+		goto exit;
+
+	info.id = owner_conn->id;
+	info.flags = owner_conn->flags;
+
+	kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &size);
+	if (meta_size > 0) {
+		kdbus_kvec_set(&kvec[cnt++], meta_items, meta_size, &size);
+		cnt += !!kdbus_kvec_pad(&kvec[cnt], &size);
+	}
+
+	info.size = size;
+
+	slice = kdbus_pool_slice_alloc(conn->pool, size, false);
+	if (IS_ERR(slice)) {
+		ret = PTR_ERR(slice);
+		slice = NULL;
+		goto exit;
+	}
+
+	ret = kdbus_pool_slice_copy_kvec(slice, 0, kvec, cnt, size);
+	if (ret < 0)
+		goto exit;
+
+	kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->info_size);
+
+	if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) ||
+	    kdbus_member_set_user(&cmd->info_size, argp,
+				  typeof(*cmd), info_size)) {
+		ret = -EFAULT;
+		goto exit;
+	}
+
+	ret = 0;
+
+exit:
+	up_read(&bus->name_registry->rwlock);
+	kdbus_pool_slice_release(slice);
+	kfree(meta_items);
+	kdbus_meta_conn_unref(conn_meta);
+	kdbus_conn_unref(owner_conn);
+	return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_update() - handle KDBUS_CMD_UPDATE
+ * @conn:		connection to update (attach flags and/or policy)
+ * @argp:		command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_update(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_item *item_policy;
+	u64 *item_attach_send = NULL;
+	u64 *item_attach_recv = NULL;
+	struct kdbus_cmd *cmd;
+	u64 attach_send;
+	u64 attach_recv;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_ATTACH_FLAGS_SEND },
+		{ .type = KDBUS_ITEM_ATTACH_FLAGS_RECV },
+		{ .type = KDBUS_ITEM_NAME, .multiple = true },
+		{ .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	item_attach_send = argv[1].item ? &argv[1].item->data64[0] : NULL;
+	item_attach_recv = argv[2].item ? &argv[2].item->data64[0] : NULL;
+	item_policy = argv[3].item ? : argv[4].item;
+
+	if (item_attach_send) {
+		if (!kdbus_conn_is_ordinary(conn) &&
+		    !kdbus_conn_is_monitor(conn)) {
+			ret = -EOPNOTSUPP;
+			goto exit;
+		}
+
+		ret = kdbus_sanitize_attach_flags(*item_attach_send,
+						  &attach_send);
+		if (ret < 0)
+			goto exit;
+	}
+
+	if (item_attach_recv) {
+		if (!kdbus_conn_is_ordinary(conn) &&
+		    !kdbus_conn_is_monitor(conn) &&
+		    !kdbus_conn_is_activator(conn)) {
+			ret = -EOPNOTSUPP;
+			goto exit;
+		}
+
+		ret = kdbus_sanitize_attach_flags(*item_attach_recv,
+						  &attach_recv);
+		if (ret < 0)
+			goto exit;
+	}
+
+	if (item_policy && !kdbus_conn_is_policy_holder(conn)) {
+		ret = -EOPNOTSUPP;
+		goto exit;
+	}
+
+	/* all input is verified; from here on the connection is modified */
+
+	if (item_policy) {
+		ret = kdbus_policy_set(&conn->ep->bus->policy_db, cmd->items,
+				       KDBUS_ITEMS_SIZE(cmd, items),
+				       1, true, conn);
+		if (ret < 0)
+			goto exit;
+	}
+
+	if (item_attach_send)
+		atomic64_set(&conn->attach_flags_send, attach_send);
+
+	if (item_attach_recv)
+		atomic64_set(&conn->attach_flags_recv, attach_recv);
+
+exit:
+	return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_send() - handle KDBUS_CMD_SEND
+ * @conn:		sending connection; must be an ordinary connection
+ * @f:			file this command was called on
+ * @argp:		command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_send(struct kdbus_conn *conn, struct file *f, void __user *argp)
+{
+	struct kdbus_cmd_send *cmd;
+	struct kdbus_staging *staging = NULL;
+	struct kdbus_msg *msg = NULL;
+	struct file *cancel_fd = NULL;
+	int ret, ret2;
+
+	/* command arguments */
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_CANCEL_FD },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_SEND_SYNC_REPLY,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	/* message arguments */
+	struct kdbus_arg msg_argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_PAYLOAD_VEC, .multiple = true },
+		{ .type = KDBUS_ITEM_PAYLOAD_MEMFD, .multiple = true },
+		{ .type = KDBUS_ITEM_FDS },
+		{ .type = KDBUS_ITEM_BLOOM_FILTER },
+		{ .type = KDBUS_ITEM_DST_NAME },
+	};
+	struct kdbus_args msg_args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_MSG_EXPECT_REPLY |
+				 KDBUS_MSG_NO_AUTO_START |
+				 KDBUS_MSG_SIGNAL,
+		.argv = msg_argv,
+		.argc = ARRAY_SIZE(msg_argv),
+	};
+
+	if (!kdbus_conn_is_ordinary(conn))
+		return -EOPNOTSUPP;
+
+	/* make sure to parse both, @cmd and @msg on negotiation */
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret < 0)
+		goto exit;
+	else if (ret > 0 && !cmd->msg_address) /* negotiation without msg */
+		goto exit;
+
+	ret2 = kdbus_args_parse_msg(&msg_args, KDBUS_PTR(cmd->msg_address),
+				    &msg);
+	if (ret2 < 0) { /* cannot parse message */
+		ret = ret2;
+		goto exit;
+	} else if (ret2 > 0 && !ret) { /* msg-negot implies cmd-negot */
+		ret = -EINVAL;
+		goto exit;
+	} else if (ret > 0) { /* negotiation */
+		goto exit;
+	}
+
+	/* here we parsed both, @cmd and @msg, and neither wants negotiation */
+
+	cmd->reply.return_flags = 0;
+	kdbus_pool_publish_empty(conn->pool, &cmd->reply.offset,
+				 &cmd->reply.msg_size);
+
+	if (argv[1].item) {
+		cancel_fd = fget(argv[1].item->fds[0]);
+		if (!cancel_fd) {
+			ret = -EBADF;
+			goto exit;
+		}
+
+		if (!cancel_fd->f_op->poll) {
+			ret = -EINVAL;
+			goto exit;
+		}
+	}
+
+	/* patch in the source of this message; reject a foreign src_id */
+	if (msg->src_id > 0 && msg->src_id != conn->id) {
+		ret = -EINVAL;
+		goto exit;
+	}
+	msg->src_id = conn->id;
+
+	staging = kdbus_staging_new_user(conn->ep->bus, cmd, msg);
+	if (IS_ERR(staging)) {
+		ret = PTR_ERR(staging);
+		staging = NULL;
+		goto exit;
+	}
+
+	if (msg->dst_id == KDBUS_DST_ID_BROADCAST) {
+		down_read(&conn->ep->bus->name_registry->rwlock);
+		kdbus_bus_broadcast(conn->ep->bus, conn, staging);
+		up_read(&conn->ep->bus->name_registry->rwlock);
+	} else if (cmd->flags & KDBUS_SEND_SYNC_REPLY) {
+		struct kdbus_reply *r;
+		ktime_t exp;
+
+		exp = ns_to_ktime(msg->timeout_ns);
+		r = kdbus_conn_call(conn, staging, exp);
+		if (IS_ERR(r)) {
+			ret = PTR_ERR(r);
+			goto exit;
+		}
+
+		ret = kdbus_conn_wait_reply(conn, cmd, f, cancel_fd, r, exp);
+		kdbus_reply_unref(r);
+		if (ret < 0)
+			goto exit;
+	} else if ((msg->flags & KDBUS_MSG_EXPECT_REPLY) ||
+		   msg->cookie_reply == 0) {
+		ret = kdbus_conn_unicast(conn, staging);
+		if (ret < 0)
+			goto exit;
+	} else {
+		ret = kdbus_conn_reply(conn, staging);
+		if (ret < 0)
+			goto exit;
+	}
+
+	if (kdbus_member_set_user(&cmd->reply, argp, typeof(*cmd), reply))
+		ret = -EFAULT;
+
+exit:
+	if (cancel_fd)
+		fput(cancel_fd);
+	kdbus_staging_free(staging);
+	ret = kdbus_args_clear(&msg_args, ret);
+	return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_recv() - handle KDBUS_CMD_RECV
+ * @conn:		receiving connection (ordinary, monitor or activator)
+ * @argp:		command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_recv(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_queue_entry *entry;
+	struct kdbus_cmd_recv *cmd;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_RECV_PEEK |
+				 KDBUS_RECV_DROP |
+				 KDBUS_RECV_USE_PRIORITY,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	if (!kdbus_conn_is_ordinary(conn) &&
+	    !kdbus_conn_is_monitor(conn) &&
+	    !kdbus_conn_is_activator(conn))
+		return -EOPNOTSUPP;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	cmd->dropped_msgs = 0;
+	cmd->msg.return_flags = 0;
+	kdbus_pool_publish_empty(conn->pool, &cmd->msg.offset,
+				 &cmd->msg.msg_size);
+
+	/* DROP+priority is not reliable, so prevent it */
+	if ((cmd->flags & KDBUS_RECV_DROP) &&
+	    (cmd->flags & KDBUS_RECV_USE_PRIORITY)) {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	mutex_lock(&conn->lock);
+
+	entry = kdbus_queue_peek(&conn->queue, cmd->priority,
+				 cmd->flags & KDBUS_RECV_USE_PRIORITY);
+	if (!entry) {
+		mutex_unlock(&conn->lock);
+		ret = -EAGAIN;
+	} else if (cmd->flags & KDBUS_RECV_DROP) {
+		struct kdbus_reply *reply = kdbus_reply_ref(entry->reply);
+
+		kdbus_queue_entry_free(entry);
+
+		mutex_unlock(&conn->lock);
+
+		if (reply) {
+			mutex_lock(&reply->reply_dst->lock);
+			if (!list_empty(&reply->entry)) {
+				kdbus_reply_unlink(reply);
+				if (reply->sync)
+					kdbus_sync_reply_wakeup(reply, -EPIPE);
+				else
+					kdbus_notify_reply_dead(conn->ep->bus,
+							reply->reply_dst->id,
+							reply->cookie);
+			}
+			mutex_unlock(&reply->reply_dst->lock);
+			kdbus_notify_flush(conn->ep->bus);
+		}
+
+		kdbus_reply_unref(reply);
+	} else {
+		bool install_fds;
+
+		/*
+		 * PEEK just returns the location of the next message. Do not
+		 * install FDs nor memfds nor anything else. The only
+		 * information of interest should be the message header and
+		 * metadata. Any FD numbers in the payload are undefined for
+		 * PEEK'ed messages.
+		 * Also make sure to never install fds into a connection that
+		 * has refused to receive any. Ordinary connections will not get
+		 * messages with FDs queued (the receiver will get -ECOMM), but
+		 * eavesdroppers might.
+		 */
+		install_fds = (conn->flags & KDBUS_HELLO_ACCEPT_FD) &&
+			      !(cmd->flags & KDBUS_RECV_PEEK);
+
+		ret = kdbus_queue_entry_install(entry,
+						&cmd->msg.return_flags,
+						install_fds);
+		if (ret < 0) {
+			mutex_unlock(&conn->lock);
+			goto exit;
+		}
+
+		kdbus_pool_slice_publish(entry->slice, &cmd->msg.offset,
+					 &cmd->msg.msg_size);
+
+		if (!(cmd->flags & KDBUS_RECV_PEEK))
+			kdbus_queue_entry_free(entry);
+
+		mutex_unlock(&conn->lock);
+	}
+
+	cmd->dropped_msgs = atomic_xchg(&conn->lost_count, 0);
+	if (cmd->dropped_msgs > 0)
+		cmd->return_flags |= KDBUS_RECV_RETURN_DROPPED_MSGS;
+
+	if (kdbus_member_set_user(&cmd->msg, argp, typeof(*cmd), msg) ||
+	    kdbus_member_set_user(&cmd->dropped_msgs, argp, typeof(*cmd),
+				  dropped_msgs))
+		ret = -EFAULT;
+
+exit:
+	return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_free() - handle KDBUS_CMD_FREE
+ * @conn:		connection whose pool is operated on
+ * @argp:		command payload carrying the pool offset to release
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_free(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_cmd_free *cmd;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	if (!kdbus_conn_is_ordinary(conn) &&
+	    !kdbus_conn_is_monitor(conn) &&
+	    !kdbus_conn_is_activator(conn))
+		return -EOPNOTSUPP;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	ret = kdbus_pool_release_offset(conn->pool, cmd->offset);
+
+	return kdbus_args_clear(&args, ret);
+}
diff --git a/ipc/kdbus/connection.h b/ipc/kdbus/connection.h
new file mode 100644
index 000000000..5ee864eb0
--- /dev/null
+++ b/ipc/kdbus/connection.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_CONNECTION_H
+#define __KDBUS_CONNECTION_H
+
+#include <linux/atomic.h>
+#include <linux/kref.h>
+#include <linux/lockdep.h>
+#include <linux/path.h>
+
+#include "limits.h"
+#include "metadata.h"
+#include "pool.h"
+#include "queue.h"
+#include "util.h"
+
+#define KDBUS_HELLO_SPECIAL_CONN	(KDBUS_HELLO_ACTIVATOR | \
+					 KDBUS_HELLO_POLICY_HOLDER | \
+					 KDBUS_HELLO_MONITOR)
+
+struct kdbus_quota;
+struct kdbus_staging;
+
+/**
+ * struct kdbus_conn - connection to a bus
+ * @kref:		Reference count
+ * @active:		Active references to the connection
+ * @dep_map:		Lockdep map for active references (debug builds only)
+ * @id:			Connection ID
+ * @flags:		KDBUS_HELLO_* flags
+ * @attach_flags_send:	KDBUS_ATTACH_* flags for sending
+ * @attach_flags_recv:	KDBUS_ATTACH_* flags for receiving
+ * @description:	Human-readable connection description, used for
+ *			debugging; only set when the connection is created
+ * @ep:			The endpoint this connection belongs to
+ * @lock:		Connection data lock
+ * @hentry:		Entry in ID <-> connection map
+ * @ep_entry:		Entry in endpoint
+ * @monitor_entry:	Entry in monitor, if the connection is a monitor
+ * @reply_list:		List of connections this connection should
+ *			reply to
+ * @work:		Delayed work to handle timeouts
+ * @match_db:		Subscription filter to broadcast messages
+ * @meta_proc:		Process metadata of connection creator, or NULL
+ * @meta_fake:		Faked metadata, or NULL
+ * @pool:		The user's buffer to receive messages
+ * @user:		Owner of the connection
+ * @cred:		The credentials of the connection at creation time
+ * @pid:		Pid at creation time
+ * @root_path:		Root path at creation time
+ * @name_count:		Number of owned well-known names
+ * @request_count:	Number of pending requests issued by this
+ *			connection that are waiting for replies from
+ *			other peers
+ * @lost_count:		Number of lost broadcast messages
+ * @wait:		Wake up this endpoint
+ * @queue:		The message queue associated with this connection
+ * @quota:		Array of per-user quota indexed by user->id
+ * @n_quota:		Number of elements in quota array
+ * @activator_of:	Well-known name entry this connection acts as an
+ *			activator for
+ * @names_list:		List of well-known names
+ * @names_queue_list:	Well-known names this connection waits for
+ * @privileged:		Whether this connection is privileged on the bus
+ */
+struct kdbus_conn {
+	struct kref kref;
+	atomic_t active;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
+	u64 id;
+	u64 flags;
+	atomic64_t attach_flags_send;
+	atomic64_t attach_flags_recv;
+	const char *description;
+	struct kdbus_ep *ep;
+	struct mutex lock;
+	struct hlist_node hentry;
+	struct list_head ep_entry;
+	struct list_head monitor_entry;
+	struct list_head reply_list;
+	struct delayed_work work;
+	struct kdbus_match_db *match_db;
+	struct kdbus_meta_proc *meta_proc;
+	struct kdbus_meta_fake *meta_fake;
+	struct kdbus_pool *pool;
+	struct kdbus_user *user;
+	const struct cred *cred;
+	struct pid *pid;
+	struct path root_path;
+	atomic_t name_count;
+	atomic_t request_count;
+	atomic_t lost_count;
+	wait_queue_head_t wait;
+	struct kdbus_queue queue;
+
+	struct kdbus_quota *quota;
+	unsigned int n_quota;
+
+	/* protected by registry->rwlock */
+	struct kdbus_name_entry *activator_of;
+	struct list_head names_list;
+	struct list_head names_queue_list;
+
+	bool privileged:1;
+};
+
+struct kdbus_conn *kdbus_conn_ref(struct kdbus_conn *conn);
+struct kdbus_conn *kdbus_conn_unref(struct kdbus_conn *conn);
+bool kdbus_conn_active(const struct kdbus_conn *conn);
+int kdbus_conn_acquire(struct kdbus_conn *conn);
+void kdbus_conn_release(struct kdbus_conn *conn);
+int kdbus_conn_disconnect(struct kdbus_conn *conn, bool ensure_queue_empty);
+bool kdbus_conn_has_name(struct kdbus_conn *conn, const char *name);
+int kdbus_conn_quota_inc(struct kdbus_conn *c, struct kdbus_user *u,
+			 size_t memory, size_t fds);
+void kdbus_conn_quota_dec(struct kdbus_conn *c, struct kdbus_user *u,
+			  size_t memory, size_t fds);
+void kdbus_conn_lost_message(struct kdbus_conn *c);
+int kdbus_conn_entry_insert(struct kdbus_conn *conn_src,
+			    struct kdbus_conn *conn_dst,
+			    struct kdbus_staging *staging,
+			    struct kdbus_reply *reply,
+			    const struct kdbus_name_entry *name);
+void kdbus_conn_move_messages(struct kdbus_conn *conn_dst,
+			      struct kdbus_conn *conn_src,
+			      u64 name_id);
+
+/* policy; @conn_creds are the credentials to check on behalf of @conn */
+bool kdbus_conn_policy_own_name(struct kdbus_conn *conn,
+				const struct cred *conn_creds,
+				const char *name);
+bool kdbus_conn_policy_talk(struct kdbus_conn *conn,
+			    const struct cred *conn_creds,
+			    struct kdbus_conn *to);
+bool kdbus_conn_policy_see_name_unlocked(struct kdbus_conn *conn,
+					 const struct cred *conn_creds,
+					 const char *name);
+bool kdbus_conn_policy_see_notification(struct kdbus_conn *conn,
+					const struct cred *conn_creds,
+					const struct kdbus_msg *msg);
+
+/* command dispatcher */
+struct kdbus_conn *kdbus_cmd_hello(struct kdbus_ep *ep, bool privileged,
+				   void __user *argp);
+int kdbus_cmd_byebye_unlocked(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_conn_info(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_update(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_send(struct kdbus_conn *conn, struct file *f, void __user *argp);
+int kdbus_cmd_recv(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_free(struct kdbus_conn *conn, void __user *argp);
+
+/**
+ * kdbus_conn_is_ordinary() - Check if connection is ordinary
+ * @conn:		The connection to check
+ *
+ * Return: 1 if none of the KDBUS_HELLO_SPECIAL_CONN flags is set, 0 otherwise
+ */
+static inline int kdbus_conn_is_ordinary(const struct kdbus_conn *conn)
+{
+	return !(conn->flags & KDBUS_HELLO_SPECIAL_CONN);
+}
+
+/**
+ * kdbus_conn_is_activator() - Check if connection is an activator
+ * @conn:		The connection to check
+ *
+ * Return: 1 if KDBUS_HELLO_ACTIVATOR is set in the u64 flags, 0 otherwise
+ */
+static inline int kdbus_conn_is_activator(const struct kdbus_conn *conn)
+{
+	return !!(conn->flags & KDBUS_HELLO_ACTIVATOR);
+}
+
+/**
+ * kdbus_conn_is_policy_holder() - Check if connection is a policy holder
+ * @conn:		The connection to check
+ *
+ * Return: 1 if KDBUS_HELLO_POLICY_HOLDER is set in the u64 flags, 0 otherwise
+ */
+static inline int kdbus_conn_is_policy_holder(const struct kdbus_conn *conn)
+{
+	return !!(conn->flags & KDBUS_HELLO_POLICY_HOLDER);
+}
+
+/**
+ * kdbus_conn_is_monitor() - Check if connection is a monitor
+ * @conn:		The connection to check
+ *
+ * Return: 1 if KDBUS_HELLO_MONITOR is set in the u64 flags, 0 otherwise
+ */
+static inline int kdbus_conn_is_monitor(const struct kdbus_conn *conn)
+{
+	return !!(conn->flags & KDBUS_HELLO_MONITOR);
+}
+
+/**
+ * kdbus_conn_lock2() - Lock two connections
+ * @a:		connection A to lock or NULL
+ * @b:		connection B to lock or NULL
+ *
+ * Lock two connections in a stable order (lower memory address first). NULL
+ * arguments are skipped, and if @a equals @b the lock is only taken once.
+ */
+static inline void kdbus_conn_lock2(struct kdbus_conn *a, struct kdbus_conn *b)
+{
+	if (a < b) {
+		if (a)
+			mutex_lock(&a->lock);
+		if (b && b != a)
+			mutex_lock_nested(&b->lock, !!a);
+	} else {
+		if (b)
+			mutex_lock(&b->lock);
+		if (a && a != b)
+			mutex_lock_nested(&a->lock, !!b);
+	}
+}
+
+/**
+ * kdbus_conn_unlock2() - Unlock two connections
+ * @a:		connection A to unlock or NULL
+ * @b:		connection B to unlock or NULL
+ *
+ * Counterpart of kdbus_conn_lock2(): NULL is skipped, @a == @b unlocks once.
+ */
+static inline void kdbus_conn_unlock2(struct kdbus_conn *a,
+				      struct kdbus_conn *b)
+{
+	if (a)
+		mutex_unlock(&a->lock);
+	if (b && b != a)
+		mutex_unlock(&b->lock);
+}
+
+/**
+ * kdbus_conn_assert_active() - lockdep assert on active lock
+ * @conn:	connection that shall be active
+ *
+ * This verifies via lockdep that the caller holds an active reference to the
+ * given connection. Without lockdep support this check compiles to nothing.
+ */
+static inline void kdbus_conn_assert_active(struct kdbus_conn *conn)
+{
+	lockdep_assert_held(conn);
+}
+
+#endif
diff --git a/ipc/kdbus/domain.c b/ipc/kdbus/domain.c
new file mode 100644
index 000000000..ac9f760c1
--- /dev/null
+++ b/ipc/kdbus/domain.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "bus.h"
+#include "domain.h"
+#include "handle.h"
+#include "item.h"
+#include "limits.h"
+#include "util.h"
+
+static void kdbus_domain_control_free(struct kdbus_node *node)
+{
+	kfree(node); /* allocated via kzalloc() in kdbus_domain_control_new() */
+}
+
+static struct kdbus_node *kdbus_domain_control_new(struct kdbus_domain *domain,
+						   unsigned int access)
+{
+	struct kdbus_node *node;
+	int ret;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+
+	kdbus_node_init(node, KDBUS_NODE_CONTROL);
+
+	node->free_cb = kdbus_domain_control_free;
+	/* mode is derived from @access only, starting with owner read/write */
+	node->mode = S_IRUSR | S_IWUSR;
+	if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD))
+		node->mode |= S_IRGRP | S_IWGRP;
+	if (access & KDBUS_MAKE_ACCESS_WORLD)
+		node->mode |= S_IROTH | S_IWOTH;
+
+	ret = kdbus_node_link(node, &domain->node, "control");
+	if (ret < 0)
+		goto exit_free;
+
+	return node;
+
+exit_free:
+	kdbus_node_deactivate(node);
+	kdbus_node_unref(node);
+	return ERR_PTR(ret);
+}
+
+static void kdbus_domain_free(struct kdbus_node *node)
+{
+	struct kdbus_domain *domain =
+		container_of(node, struct kdbus_domain, node);
+
+	put_user_ns(domain->user_namespace); /* pinned in kdbus_domain_new() */
+	ida_destroy(&domain->user_ida);
+	idr_destroy(&domain->user_idr);
+	kfree(domain);
+}
+
+/**
+ * kdbus_domain_new() - create a new domain
+ * @access:		The access mode for this node (KDBUS_MAKE_ACCESS_*)
+ *
+ * Return: a new kdbus_domain on success, ERR_PTR on failure
+ */
+struct kdbus_domain *kdbus_domain_new(unsigned int access)
+{
+	struct kdbus_domain *d;
+	int ret;
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return ERR_PTR(-ENOMEM);
+
+	kdbus_node_init(&d->node, KDBUS_NODE_DOMAIN);
+
+	d->node.free_cb = kdbus_domain_free;
+	d->node.mode = S_IRUSR | S_IXUSR;
+	if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD))
+		d->node.mode |= S_IRGRP | S_IXGRP;
+	if (access & KDBUS_MAKE_ACCESS_WORLD)
+		d->node.mode |= S_IROTH | S_IXOTH;
+
+	mutex_init(&d->lock);
+	idr_init(&d->user_idr);
+	ida_init(&d->user_ida);
+
+	/* Pin user namespace so we can guarantee domain-unique bus names. */
+	d->user_namespace = get_user_ns(current_user_ns());
+
+	ret = kdbus_node_link(&d->node, NULL, NULL);
+	if (ret < 0)
+		goto exit_unref;
+
+	return d;
+
+exit_unref:
+	kdbus_node_deactivate(&d->node);
+	kdbus_node_unref(&d->node);
+	return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_domain_ref() - take a domain reference
+ * @domain:		Domain to reference, may be NULL
+ *
+ * Return: @domain itself; NULL is passed through untouched
+ */
+struct kdbus_domain *kdbus_domain_ref(struct kdbus_domain *domain)
+{
+	if (domain)
+		kdbus_node_ref(&domain->node);
+	return domain;
+}
+
+/**
+ * kdbus_domain_unref() - drop a domain reference
+ * @domain:		Domain to release, may be NULL
+ *
+ * When the last reference is dropped, the domain internal structure
+ * is freed.
+ *
+ * Return: always NULL
+ */
+struct kdbus_domain *kdbus_domain_unref(struct kdbus_domain *domain)
+{
+	if (domain)
+		kdbus_node_unref(&domain->node);
+	return NULL;
+}
+
+/**
+ * kdbus_domain_populate() - populate static domain nodes
+ * @domain:	domain to populate
+ * @access:	KDBUS_MAKE_ACCESS_* access restrictions for new nodes
+ *
+ * Allocate and activate static sub-nodes of the given domain. This will fail if
+ * you call it on a non-active node or if the domain was already populated.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_domain_populate(struct kdbus_domain *domain, unsigned int access)
+{
+	struct kdbus_node *control;
+
+	/*
+	 * Create and activate a control-node for this domain. We drop our own
+	 * reference right afterwards, effectively causing the node to be
+	 * deactivated and released when the parent domain is.
+	 */
+	control = kdbus_domain_control_new(domain, access);
+	if (IS_ERR(control))
+		return PTR_ERR(control);
+
+	kdbus_node_activate(control);
+	kdbus_node_unref(control);
+	return 0;
+}
+
+/**
+ * kdbus_user_lookup() - lookup a kdbus_user object
+ * @domain:		domain of the user
+ * @uid:		uid of the user; INVALID_UID for an anon user
+ *
+ * Lookup the kdbus user accounting object for the given domain. If INVALID_UID
+ * is passed, a new anonymous user is created which is private to the caller.
+ *
+ * Return: The user object (reference owned by caller), ERR_PTR on failure.
+ */
+struct kdbus_user *kdbus_user_lookup(struct kdbus_domain *domain, kuid_t uid)
+{
+	struct kdbus_user *u = NULL, *old = NULL;
+	int ret;
+
+	mutex_lock(&domain->lock);
+
+	if (uid_valid(uid)) {
+		old = idr_find(&domain->user_idr, __kuid_val(uid));
+		/*
+		 * If the object is about to be destroyed, ignore it and
+		 * replace the slot in the IDR later on.
+		 */
+		if (old && kref_get_unless_zero(&old->kref)) {
+			mutex_unlock(&domain->lock);
+			return old;
+		}
+	}
+
+	u = kzalloc(sizeof(*u), GFP_KERNEL);
+	if (!u) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	kref_init(&u->kref);
+	u->domain = kdbus_domain_ref(domain);
+	u->uid = uid;
+	atomic_set(&u->buses, 0);
+	atomic_set(&u->connections, 0);
+
+	if (uid_valid(uid)) {
+		if (old) {
+			idr_replace(&domain->user_idr, u, __kuid_val(uid));
+			old->uid = INVALID_UID; /* mark old as removed */
+		} else {
+			ret = idr_alloc(&domain->user_idr, u, __kuid_val(uid),
+					__kuid_val(uid) + 1, GFP_KERNEL);
+			if (ret < 0)
+				goto exit;
+		}
+	}
+
+	/*
+	 * Allocate the smallest possible index for this user; used
+	 * in arrays for accounting user quota in receiver queues.
+	 */
+	ret = ida_simple_get(&domain->user_ida, 1, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto exit;
+
+	u->id = ret;
+	mutex_unlock(&domain->lock);
+	return u;
+
+exit: /* drop the IDR slot only if @u really got registered in it */
+	if (u && uid_valid(u->uid) &&
+	    idr_find(&domain->user_idr, __kuid_val(u->uid)) == u)
+		idr_remove(&domain->user_idr, __kuid_val(u->uid));
+	if (u)
+		kdbus_domain_unref(u->domain);
+	kfree(u);
+	mutex_unlock(&domain->lock);
+	return ERR_PTR(ret);
+}
+
+static void __kdbus_user_free(struct kref *kref)
+{
+	struct kdbus_user *user = container_of(kref, struct kdbus_user, kref);
+
+	WARN_ON(atomic_read(&user->buses) > 0);
+	WARN_ON(atomic_read(&user->connections) > 0);
+
+	mutex_lock(&user->domain->lock);
+	ida_simple_remove(&user->domain->user_ida, user->id);
+	if (uid_valid(user->uid)) /* invalid if anon or replaced in the IDR */
+		idr_remove(&user->domain->user_idr, __kuid_val(user->uid));
+	mutex_unlock(&user->domain->lock);
+
+	kdbus_domain_unref(user->domain);
+	kfree(user);
+}
+
+/**
+ * kdbus_user_ref() - take a user reference
+ * @u:		User to reference, may be NULL
+ *
+ * Return: @u is returned; NULL is passed through untouched
+ */
+struct kdbus_user *kdbus_user_ref(struct kdbus_user *u)
+{
+	if (u)
+		kref_get(&u->kref);
+	return u;
+}
+
+/**
+ * kdbus_user_unref() - drop a user reference
+ * @u:		User to release, may be NULL; freed on the last reference
+ *
+ * Return: always NULL
+ */
+struct kdbus_user *kdbus_user_unref(struct kdbus_user *u)
+{
+	if (u)
+		kref_put(&u->kref, __kdbus_user_free);
+	return NULL;
+}
diff --git a/ipc/kdbus/domain.h b/ipc/kdbus/domain.h
new file mode 100644
index 000000000..447a2bd4d
--- /dev/null
+++ b/ipc/kdbus/domain.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_DOMAIN_H
+#define __KDBUS_DOMAIN_H
+
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/user_namespace.h>
+
+#include "node.h"
+
+/**
+ * struct kdbus_domain - domain for buses
+ * @node:		Underlying API node
+ * @lock:		Domain data lock, guards @user_idr and @user_ida
+ * @last_id:		Last used object id
+ * @user_idr:		Set of all users with a valid UID, indexed by UID
+ * @user_ida:		Set of all users to compute small indices
+ * @user_namespace:	User namespace, pinned at creation time
+ * @dentry:		Root dentry of VFS mount (don't use outside of kdbusfs)
+ */
+struct kdbus_domain {
+	struct kdbus_node node;
+	struct mutex lock;
+	atomic64_t last_id;
+	struct idr user_idr;
+	struct ida user_ida;
+	struct user_namespace *user_namespace;
+	struct dentry *dentry;
+};
+
+/**
+ * struct kdbus_user - resource accounting for users
+ * @kref:		Reference counter
+ * @domain:		Domain of the user (a reference is held)
+ * @id:			Small index of this user, from the domain's user_ida
+ * @uid:		UID of the user, INVALID_UID for anonymous users
+ * @buses:		Number of buses the user has created
+ * @connections:	Number of connections the user has created
+ */
+struct kdbus_user {
+	struct kref kref;
+	struct kdbus_domain *domain;
+	unsigned int id;
+	kuid_t uid;
+	atomic_t buses;
+	atomic_t connections;
+};
+
+#define kdbus_domain_from_node(_node) \
+	container_of((_node), struct kdbus_domain, node)
+
+struct kdbus_domain *kdbus_domain_new(unsigned int access);
+struct kdbus_domain *kdbus_domain_ref(struct kdbus_domain *domain);
+struct kdbus_domain *kdbus_domain_unref(struct kdbus_domain *domain);
+int kdbus_domain_populate(struct kdbus_domain *domain, unsigned int access);
+
+#define KDBUS_USER_KERNEL_ID 0 /* ID 0 is reserved for kernel accounting */
+
+struct kdbus_user *kdbus_user_lookup(struct kdbus_domain *domain, kuid_t uid);
+struct kdbus_user *kdbus_user_ref(struct kdbus_user *u);
+struct kdbus_user *kdbus_user_unref(struct kdbus_user *u);
+
+#endif
diff --git a/ipc/kdbus/endpoint.c b/ipc/kdbus/endpoint.c
new file mode 100644
index 000000000..977964dbb
--- /dev/null
+++ b/ipc/kdbus/endpoint.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "message.h"
+#include "policy.h"
+
+static void kdbus_ep_free(struct kdbus_node *node)
+{
+	struct kdbus_ep *e = container_of(node, struct kdbus_ep, node);
+
+	/* an endpoint is expected to have lost all connections by now */
+	WARN_ON(!list_empty(&e->conn_list));
+	kdbus_policy_db_clear(&e->policy_db);
+	kdbus_bus_unref(e->bus);
+	kdbus_user_unref(e->user);
+	kfree(e);
+}
+
+static void kdbus_ep_release(struct kdbus_node *node, bool was_active)
+{
+	struct kdbus_ep *ep = container_of(node, struct kdbus_ep, node);
+	struct kdbus_conn *c;
+
+	/*
+	 * Disconnect every connection still linked to this endpoint. Each
+	 * round takes the current list head under ep->lock, pins it, and
+	 * performs the actual disconnect with the lock dropped again.
+	 */
+	do {
+		mutex_lock(&ep->lock);
+		c = list_first_entry_or_null(&ep->conn_list,
+					     struct kdbus_conn, ep_entry);
+		if (c)
+			kdbus_conn_ref(c);
+		mutex_unlock(&ep->lock);
+
+		/* an empty list ends the loop */
+		if (c) {
+			kdbus_conn_disconnect(c, false);
+			kdbus_conn_unref(c);
+		}
+	} while (c);
+}
+
+/**
+ * kdbus_ep_new() - create a new endpoint
+ * @bus:		The bus this endpoint will be created for
+ * @name:		The name of the endpoint
+ * @access:		The access flags for this node (KDBUS_MAKE_ACCESS_*)
+ * @uid:		The uid of the node
+ * @gid:		The gid of the node
+ * @is_custom:		Whether this is a custom endpoint
+ *
+ * Allocate an endpoint and link it below @bus as @name. Custom endpoints get
+ * their name verified against @uid and account on a private, anonymous user.
+ *
+ * Return: a new kdbus_ep on success, ERR_PTR on failure.
+ */
+struct kdbus_ep *kdbus_ep_new(struct kdbus_bus *bus, const char *name,
+			      unsigned int access, kuid_t uid, kgid_t gid,
+			      bool is_custom)
+{
+	struct kdbus_ep *e;
+	int ret;
+
+	/*
+	 * Validate only custom endpoints names, default endpoints
+	 * with a "bus" name are created when the bus is created
+	 */
+	if (is_custom) {
+		ret = kdbus_verify_uid_prefix(name, bus->domain->user_namespace,
+					      uid);
+		if (ret < 0)
+			return ERR_PTR(ret);
+	}
+
+	e = kzalloc(sizeof(*e), GFP_KERNEL);
+	if (!e)
+		return ERR_PTR(-ENOMEM);
+
+	kdbus_node_init(&e->node, KDBUS_NODE_ENDPOINT);
+
+	e->node.free_cb = kdbus_ep_free;
+	e->node.release_cb = kdbus_ep_release;
+	e->node.uid = uid;
+	e->node.gid = gid;
+	e->node.mode = S_IRUSR | S_IWUSR;
+	if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD))
+		e->node.mode |= S_IRGRP | S_IWGRP;
+	if (access & KDBUS_MAKE_ACCESS_WORLD)
+		e->node.mode |= S_IROTH | S_IWOTH;
+
+	mutex_init(&e->lock);
+	INIT_LIST_HEAD(&e->conn_list);
+	kdbus_policy_db_init(&e->policy_db);
+	e->bus = kdbus_bus_ref(bus);
+
+	ret = kdbus_node_link(&e->node, &bus->node, name);
+	if (ret < 0)
+		goto exit_unref;
+
+	/*
+	 * Transactions on custom endpoints are never accounted on the global
+	 * user limits. Instead, for each custom endpoint, we create a custom,
+	 * unique user, which all transactions are accounted on. Regardless of
+	 * the user using that endpoint, it is always accounted on the same
+	 * user-object. This budget is not shared with ordinary users on
+	 * non-custom endpoints.
+	 */
+	if (is_custom) {
+		e->user = kdbus_user_lookup(bus->domain, INVALID_UID);
+		if (IS_ERR(e->user)) {
+			ret = PTR_ERR(e->user);
+			e->user = NULL;
+			goto exit_unref;
+		}
+	}
+
+	return e;
+
+exit_unref:
+	kdbus_node_deactivate(&e->node);
+	kdbus_node_unref(&e->node);
+	return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_ep_ref() - increase the reference counter of a kdbus_ep
+ * @ep:			The endpoint to reference, may be NULL
+ *
+ * Every user of an endpoint, except for its creator, must add a reference to
+ * the kdbus_ep instance using this function.
+ *
+ * Return: the ep itself (a NULL @ep is passed through untouched)
+ */
+struct kdbus_ep *kdbus_ep_ref(struct kdbus_ep *ep)
+{
+	if (ep)
+		kdbus_node_ref(&ep->node);
+	return ep;
+}
+
+/**
+ * kdbus_ep_unref() - decrease the reference counter of a kdbus_ep
+ * @ep:		The ep to unref, may be NULL (then nothing is dropped)
+ *
+ * Release a reference on the endpoint's node. If the reference count drops
+ * to 0, the ep will be freed.
+ *
+ * Return: NULL
+ */
+struct kdbus_ep *kdbus_ep_unref(struct kdbus_ep *ep)
+{
+	if (ep)
+		kdbus_node_unref(&ep->node);
+	return NULL;
+}
+
+/**
+ * kdbus_cmd_ep_make() - handle KDBUS_CMD_ENDPOINT_MAKE
+ * @bus:		bus the custom endpoint is created on
+ * @argp:		command payload
+ *
+ * Return: NULL or newly created endpoint on success, ERR_PTR on failure.
+ */
+struct kdbus_ep *kdbus_cmd_ep_make(struct kdbus_bus *bus, void __user *argp)
+{
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_MAKE_NAME, .mandatory = true },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_MAKE_ACCESS_GROUP |
+				 KDBUS_MAKE_ACCESS_WORLD,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+	struct kdbus_ep *ep;
+	struct kdbus_cmd *cmd;
+	int ret;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL;
+
+	/*
+	 * ret is 0 from here on. Create the custom endpoint, named after the
+	 * mandatory MAKE_NAME item and owned by the caller's effective ids,
+	 * then activate it. A failure of either step is recorded in ret and
+	 * handled after the arguments were cleared.
+	 */
+	ep = kdbus_ep_new(bus, argv[1].item->str, cmd->flags,
+			  current_euid(), current_egid(), true);
+	if (IS_ERR(ep)) {
+		ret = PTR_ERR(ep);
+		ep = NULL;
+	} else if (!kdbus_node_activate(&ep->node)) {
+		ret = -ESHUTDOWN;
+	}
+
+	/* kdbus_args_clear() receives our status and has the final say */
+	ret = kdbus_args_clear(&args, ret);
+	if (ret >= 0)
+		return ep;
+
+	/* failure: take down an endpoint that was already created */
+	if (ep) {
+		kdbus_node_deactivate(&ep->node);
+		kdbus_ep_unref(ep);
+	}
+
+	return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_cmd_ep_update() - handle KDBUS_CMD_ENDPOINT_UPDATE
+ * @ep:			endpoint whose policy database is updated
+ * @argp:		command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_ep_update(struct kdbus_ep *ep, void __user *argp)
+{
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_NAME, .multiple = true },
+		{ .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+	struct kdbus_cmd *cmd;
+	int ret;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret)
+		return ret;
+
+	/* hand the parsed items to the endpoint's policy database */
+	return kdbus_args_clear(&args,
+				kdbus_policy_set(&ep->policy_db, args.items,
+						 args.items_size, 0, true, ep));
+}
diff --git a/ipc/kdbus/endpoint.h b/ipc/kdbus/endpoint.h
new file mode 100644
index 000000000..bc1b94a70
--- /dev/null
+++ b/ipc/kdbus/endpoint.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_ENDPOINT_H
+#define __KDBUS_ENDPOINT_H
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/uidgid.h>
+#include "node.h"
+#include "policy.h"
+
+struct kdbus_bus;
+struct kdbus_user;
+
+/**
+ * struct kdbus_ep - endpoint to access a bus
+ * @node:		The kdbus node
+ * @lock:		Endpoint data lock
+ * @bus:		Bus behind this endpoint
+ * @user:		Custom endpoints account against an anonymous user
+ * @policy_db:		Uploaded policy
+ * @conn_list:		Connections of this endpoint
+ *
+ * An endpoint offers access to a bus; the default endpoint node name is "bus".
+ * Additional custom endpoints to the same bus can be created and they can
+ * carry their own policies/filters.
+ */
+struct kdbus_ep {
+	struct kdbus_node node;
+	struct mutex lock;
+
+	/* static, assigned in kdbus_ep_new(); user stays NULL if not custom */
+	struct kdbus_bus *bus;
+	struct kdbus_user *user;
+
+	/* protected by own locks */
+	struct kdbus_policy_db policy_db;
+
+	/* protected by ep->lock */
+	struct list_head conn_list;
+};
+
+#define kdbus_ep_from_node(_node) \
+	container_of((_node), struct kdbus_ep, node)
+
+struct kdbus_ep *kdbus_ep_new(struct kdbus_bus *bus, const char *name,
+			      unsigned int access, kuid_t uid, kgid_t gid,
+			      bool is_custom);
+struct kdbus_ep *kdbus_ep_ref(struct kdbus_ep *ep);
+struct kdbus_ep *kdbus_ep_unref(struct kdbus_ep *ep);
+
+struct kdbus_ep *kdbus_cmd_ep_make(struct kdbus_bus *bus, void __user *argp);
+int kdbus_cmd_ep_update(struct kdbus_ep *ep, void __user *argp);
+
+#endif
diff --git a/ipc/kdbus/fs.c b/ipc/kdbus/fs.c
new file mode 100644
index 000000000..09c480924
--- /dev/null
+++ b/ipc/kdbus/fs.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/init.h>
+#include <linux/ipc_namespace.h>
+#include <linux/magic.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "bus.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "fs.h"
+#include "handle.h"
+#include "node.h"
+
+#define kdbus_node_from_dentry(_dentry) \
+	((struct kdbus_node *)(_dentry)->d_fsdata)
+
+static struct inode *fs_inode_get(struct super_block *sb,
+				  struct kdbus_node *node);
+
+/*
+ * Directory Management
+ */
+
+static inline unsigned char kdbus_dt_type(struct kdbus_node *node)
+{
+	/* domains and buses are exposed as directories */
+	if (node->type == KDBUS_NODE_DOMAIN || node->type == KDBUS_NODE_BUS)
+		return DT_DIR;
+
+	/* control nodes and endpoints are exposed as regular files */
+	if (node->type == KDBUS_NODE_CONTROL ||
+	    node->type == KDBUS_NODE_ENDPOINT)
+		return DT_REG;
+
+	return DT_UNKNOWN;
+}
+
+static int fs_dir_fop_iterate(struct file *file, struct dir_context *ctx)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct kdbus_node *parent = kdbus_node_from_dentry(dentry);
+	struct kdbus_node *old, *next = file->private_data;
+
+	/*
+	 * kdbusfs directory iterator (modelled after sysfs/kernfs)
+	 * When iterating kdbusfs directories, we iterate all children of the
+	 * parent kdbus_node object. We use ctx->pos to store the hash of the
+	 * child and file->private_data to store a reference to the next node
+	 * object. If ctx->pos is not modified via llseek while you iterate a
+	 * directory, then we use the file->private_data node pointer to
+	 * directly access the next node in the tree.
+	 * However, if you directly seek on the directory, we have to find the
+	 * closest node to that position and cannot use our node pointer. This
+	 * means iterating the rb-tree to find the closest match and start over
+	 * from there.
+	 * Note that hash values are not necessarily unique. Therefore, llseek
+	 * is not guaranteed to seek to the same node that you got when you
+	 * retrieved the position. Seeking to 0, 1, 2 and >=INT_MAX is safe,
+	 * though. We could use the inode-number as position, but this would
+	 * require another rb-tree for fast access. Kernfs and others already
+	 * ignore those conflicts, so we should be fine, too.
+	 */
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	/* acquire @next; if deactivated, or seek detected, find next node */
+	old = next;
+	if (next && ctx->pos == next->hash) {
+		if (kdbus_node_acquire(next))
+			kdbus_node_ref(next);
+		else
+			next = kdbus_node_next_child(parent, next);
+	} else {
+		next = kdbus_node_find_closest(parent, ctx->pos);
+	}
+	kdbus_node_unref(old); /* ref formerly held by private_data */
+
+	while (next) {
+		/* emit @next */
+		file->private_data = next;
+		ctx->pos = next->hash;
+
+		kdbus_node_release(next);
+
+		if (!dir_emit(ctx, next->name, strlen(next->name), next->id,
+			      kdbus_dt_type(next)))
+			return 0; /* buffer full: @next stays in private_data */
+
+		/* find next node after @next */
+		old = next;
+		next = kdbus_node_next_child(parent, next);
+		kdbus_node_unref(old);
+	}
+	/* no children left: forget the cached node, park pos at the end */
+	file->private_data = NULL;
+	ctx->pos = INT_MAX;
+
+	return 0;
+}
+
+static loff_t fs_dir_fop_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct mutex *lock = &file_inode(file)->i_mutex;
+	loff_t pos;
+
+	/* serialize updates of the file position against fop_iterate */
+	mutex_lock(lock);
+	pos = generic_file_llseek(file, offset, whence);
+	mutex_unlock(lock);
+
+	return pos;
+}
+
+static int fs_dir_fop_release(struct inode *inode, struct file *file)
+{
+	kdbus_node_unref(file->private_data); /* node cached by fop_iterate */
+	return 0;
+}
+
+static const struct file_operations fs_dir_fops = {
+	.read		= generic_read_dir,
+	.iterate	= fs_dir_fop_iterate,	/* emits child nodes */
+	.llseek		= fs_dir_fop_llseek,	/* seeks under i_mutex */
+	.release	= fs_dir_fop_release,	/* drops cached node ref */
+};
+
+static struct dentry *fs_dir_iop_lookup(struct inode *dir,
+					struct dentry *dentry,
+					unsigned int flags)
+{
+	struct kdbus_node *parent, *child;
+	struct dentry *res = NULL;
+	struct inode *inode;
+
+	parent = kdbus_node_from_dentry(dentry->d_parent);
+	if (!kdbus_node_acquire(parent))
+		return NULL;
+
+	/* on success, this yields a reference to an _acquired_ child node */
+	child = kdbus_node_find_child(parent, dentry->d_name.name);
+	if (!child)
+		goto out;
+
+	/* the child reference is parked in d_fsdata of the dentry */
+	dentry->d_fsdata = child;
+	inode = fs_inode_get(dir->i_sb, child);
+	if (!IS_ERR(inode))
+		res = d_splice_alias(inode, dentry);
+	else
+		res = ERR_CAST(inode);
+	kdbus_node_release(child);
+out:
+	kdbus_node_release(parent);
+	return res;
+}
+
+static const struct inode_operations fs_dir_iops = {
+	.permission	= generic_permission,
+	.lookup		= fs_dir_iop_lookup,	/* name -> child kdbus_node */
+};
+
+/*
+ * Inode Management
+ */
+
+static const struct inode_operations fs_inode_iops = {
+	.permission	= generic_permission,
+};
+
+static struct inode *fs_inode_get(struct super_block *sb,
+				  struct kdbus_node *node)
+{
+	struct inode *ino = iget_locked(sb, node->id);
+
+	if (!ino)
+		return ERR_PTR(-ENOMEM);
+
+	/* an inode without I_NEW is handed out as it is */
+	if (!(ino->i_state & I_NEW))
+		return ino;
+
+	/* new inode: pin the node and copy its attributes over */
+	ino->i_private = kdbus_node_ref(node);
+	ino->i_mapping->a_ops = &empty_aops;
+	ino->i_mode = node->mode & S_IALLUGO;
+	ino->i_atime = ino->i_ctime = ino->i_mtime = CURRENT_TIME;
+	ino->i_uid = node->uid;
+	ino->i_gid = node->gid;
+
+	if (node->type == KDBUS_NODE_DOMAIN || node->type == KDBUS_NODE_BUS) {
+		/* domains and buses are directories */
+		ino->i_mode |= S_IFDIR;
+		ino->i_op = &fs_dir_iops;
+		ino->i_fop = &fs_dir_fops;
+		set_nlink(ino, 2);
+	} else if (node->type == KDBUS_NODE_CONTROL ||
+		   node->type == KDBUS_NODE_ENDPOINT) {
+		/* control nodes and endpoints are files using kdbus_handle_ops */
+		ino->i_mode |= S_IFREG;
+		ino->i_op = &fs_inode_iops;
+		ino->i_fop = &kdbus_handle_ops;
+	}
+
+	unlock_new_inode(ino);
+
+	return ino;
+}
+
+/*
+ * Superblock Management
+ */
+
+static int fs_super_dop_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct kdbus_node *n;
+
+	/* negative dentries always go through a fresh lookup */
+	if (!dentry->d_inode)
+		return 0;
+
+	/*
+	 * A positive dentry is reported valid only while its node is still
+	 * active; a node that has been removed invalidates it.
+	 */
+	n = kdbus_node_from_dentry(dentry);
+
+	return kdbus_node_is_active(n) ? 1 : 0;
+}
+
+static void fs_super_dop_release(struct dentry *dentry)
+{
+	kdbus_node_unref(dentry->d_fsdata); /* ref stored by ->lookup() */
+}
+
+static const struct dentry_operations fs_super_dops = {
+	.d_revalidate	= fs_super_dop_revalidate, /* 0 if node inactive */
+	.d_release	= fs_super_dop_release,	   /* unrefs d_fsdata node */
+};
+
+static void fs_super_sop_evict_inode(struct inode *inode)
+{
+	struct kdbus_node *node = kdbus_node_from_inode(inode);
+
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	kdbus_node_unref(node); /* ref taken in fs_inode_get() */
+}
+
+static const struct super_operations fs_super_sops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= fs_super_sop_evict_inode, /* unrefs i_private node */
+};
+
+static int fs_super_fill(struct super_block *sb)
+{
+	struct kdbus_domain *domain = sb->s_fs_info;
+	struct inode *inode;
+	int ret;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = KDBUS_SUPER_MAGIC;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_op = &fs_super_sops;
+	sb->s_time_gran = 1;
+	/* the domain node backs the root inode of this mount */
+	inode = fs_inode_get(sb, &domain->node);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root) {
+		/* d_make_root iput()s the inode on failure */
+		return -ENOMEM;
+	}
+
+	/* sb holds domain reference */
+	sb->s_root->d_fsdata = &domain->node;
+	sb->s_d_op = &fs_super_dops;
+
+	/* sb holds root reference */
+	domain->dentry = sb->s_root;
+	/* errors from here on are unwound by the caller via ->kill_sb() */
+	if (!kdbus_node_activate(&domain->node))
+		return -ESHUTDOWN;
+
+	ret = kdbus_domain_populate(domain, KDBUS_MAKE_ACCESS_WORLD);
+	if (ret < 0)
+		return ret;
+
+	sb->s_flags |= MS_ACTIVE;
+	return 0;
+}
+
+static void fs_super_kill(struct super_block *sb)
+{
+	struct kdbus_domain *domain = sb->s_fs_info;
+
+	if (domain) {
+		kdbus_node_deactivate(&domain->node);
+		domain->dentry = NULL; /* root dentry goes away with the sb */
+	}
+	/* the domain is deactivated before the superblock is killed */
+	kill_anon_super(sb);
+	kdbus_domain_unref(domain);
+}
+
+static int fs_super_set(struct super_block *sb, void *data)
+{
+	int err = set_anon_super(sb, data);
+
+	if (err)
+		return err;
+
+	sb->s_fs_info = data; /* @data is the domain passed to sget() */
+	return 0;
+}
+
+static struct dentry *fs_super_mount(struct file_system_type *fs_type,
+				     int flags, const char *dev_name,
+				     void *data)
+{
+	struct kdbus_domain *domain;
+	struct super_block *sb;
+	int ret;
+	/* every mount gets a freshly created domain of its own */
+	domain = kdbus_domain_new(KDBUS_MAKE_ACCESS_WORLD);
+	if (IS_ERR(domain))
+		return ERR_CAST(domain);
+
+	sb = sget(fs_type, NULL, fs_super_set, flags, domain);
+	if (IS_ERR(sb)) {
+		kdbus_node_deactivate(&domain->node);
+		kdbus_domain_unref(domain);
+		return ERR_CAST(sb);
+	}
+	/* expected: fs_super_set() stored our domain, no root set up yet */
+	WARN_ON(sb->s_fs_info != domain);
+	WARN_ON(sb->s_root);
+
+	ret = fs_super_fill(sb);
+	if (ret < 0) {
+		/* calls into ->kill_sb() when done */
+		deactivate_locked_super(sb);
+		return ERR_PTR(ret);
+	}
+
+	return dget(sb->s_root);
+}
+
+static struct file_system_type fs_type = {
+	.name		= KBUILD_MODNAME "fs",
+	.owner		= THIS_MODULE,
+	.mount		= fs_super_mount,	/* new domain per mount */
+	.kill_sb	= fs_super_kill,	/* deactivates the domain */
+	.fs_flags	= FS_USERNS_MOUNT,
+};
+
+/**
+ * kdbus_fs_init() - register kdbus filesystem
+ *
+ * This registers a filesystem with the VFS layer. The filesystem is called
+ * `KBUILD_MODNAME "fs"', which usually resolves to `kdbusfs'. The naming
+ * scheme allows setting KBUILD_MODNAME to "kdbus2", which gives you an
+ * independent filesystem for developers.
+ *
+ * Each mount of the kdbusfs filesystem has a kdbus_domain attached.
+ * Operations on this mount will only affect the attached domain. On each mount
+ * a new domain is automatically created and used for this mount exclusively.
+ * If you want to share a domain across multiple mounts, you need to bind-mount
+ * it.
+ *
+ * Mounts of kdbusfs (with a different domain each) are unrelated to each other
+ * and will never have any effect on any domain but their own.
+ *
+ * Return: 0 on success, negative error otherwise.
+ */
+int kdbus_fs_init(void)
+{
+	return register_filesystem(&fs_type);
+}
+
+/**
+ * kdbus_fs_exit() - unregister kdbus filesystem
+ *
+ * This is the reverse of kdbus_fs_init(): it unregisters the kdbusfs
+ * filesystem type from the VFS.
+ */
+void kdbus_fs_exit(void)
+{
+	unregister_filesystem(&fs_type);
+}
+
+/* acquire the domain @node belongs to; NULL if any ancestor is inactive */
+static struct kdbus_domain *fs_acquire_domain(struct kdbus_node *node)
+{
+	struct kdbus_node *root = node;
+	struct kdbus_domain *domain;
+
+	/* walk up to the root; the caller guarantees that @node is linked */
+	while (root->parent) {
+		if (!kdbus_node_is_active(root->parent))
+			return NULL;
+		root = root->parent;
+	}
+
+	/* the top of the hierarchy has to be a domain node */
+	if (WARN_ON(root->type != KDBUS_NODE_DOMAIN))
+		return NULL;
+
+	domain = kdbus_domain_from_node(root);
+
+	return kdbus_node_acquire(&domain->node) ? domain : NULL;
+}
+
+/**
+ * kdbus_fs_flush() - flush dcache entries of a node
+ * @node:		Node to flush entries of
+ *
+ * This flushes all VFS filesystem cache entries for a node and all its
+ * children. This should be called whenever a node is destroyed during
+ * runtime. It will flush the cache entries so the linked objects can be
+ * deallocated.
+ *
+ * This is a no-op if you call it on active nodes (they really should stay in
+ * cache) or on nodes with deactivated parents (flushing the parent is enough).
+ * Furthermore, there is no need to call it on nodes whose lifetime is bound to
+ * their parents'. In those cases, the parent-flush will always also flush the
+ * children.
+ */
+void kdbus_fs_flush(struct kdbus_node *node)
+{
+	struct dentry *dentry, *parent_dentry = NULL;
+	struct kdbus_domain *domain;
+	struct qstr name;
+
+	/* active nodes should remain in cache */
+	if (!kdbus_node_is_deactivated(node))
+		return;
+
+	/* nodes that were never linked were never instantiated */
+	if (!node->parent)
+		return;
+
+	/* acquire domain and verify all ancestors are active */
+	domain = fs_acquire_domain(node);
+	if (!domain)
+		return;
+
+	switch (node->type) {
+	case KDBUS_NODE_ENDPOINT:
+		if (WARN_ON(!node->parent || !node->parent->name))
+			goto exit;
+		/* endpoints sit below a bus: find the bus dentry first */
+		name.name = node->parent->name;
+		name.len = strlen(node->parent->name);
+		parent_dentry = d_hash_and_lookup(domain->dentry, &name);
+		if (IS_ERR_OR_NULL(parent_dentry))
+			goto exit;
+
+		/* fallthrough */
+	case KDBUS_NODE_BUS:
+		if (WARN_ON(!node->name))
+			goto exit;
+		/* look @node up below the bus dentry, or the root for buses */
+		name.name = node->name;
+		name.len = strlen(node->name);
+		dentry = d_hash_and_lookup(parent_dentry ? : domain->dentry,
+					   &name);
+		if (!IS_ERR_OR_NULL(dentry)) {
+			d_invalidate(dentry);
+			dput(dentry);
+		}
+
+		dput(parent_dentry); /* still NULL when @node is a bus */
+		break;
+
+	default:
+		/* all other types are bound to their parent lifetime */
+		break;
+	}
+
+exit:
+	kdbus_node_release(&domain->node);
+}
diff --git a/ipc/kdbus/fs.h b/ipc/kdbus/fs.h
new file mode 100644
index 000000000..62f7d6abf
--- /dev/null
+++ b/ipc/kdbus/fs.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUSFS_H
+#define __KDBUSFS_H
+
+#include <linux/kernel.h>
+
+struct kdbus_node;
+
+int kdbus_fs_init(void);
+void kdbus_fs_exit(void);
+void kdbus_fs_flush(struct kdbus_node *node);
+
+#define kdbus_node_from_inode(_inode) \
+	((struct kdbus_node *)(_inode)->i_private)
+
+#endif
diff --git a/ipc/kdbus/handle.c b/ipc/kdbus/handle.c
new file mode 100644
index 000000000..e0e06b0e1
--- /dev/null
+++ b/ipc/kdbus/handle.c
@@ -0,0 +1,709 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/kdev_t.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "fs.h"
+#include "handle.h"
+#include "item.h"
+#include "match.h"
+#include "message.h"
+#include "names.h"
+#include "domain.h"
+#include "policy.h"
+
+static int kdbus_args_verify(struct kdbus_args *args)
+{
+	struct kdbus_item *item;
+	size_t i;
+	int ret;
+
+	KDBUS_ITEMS_FOREACH(item, args->items, args->items_size) {
+		struct kdbus_arg *arg = NULL;
+
+		if (!KDBUS_ITEM_VALID(item, args->items, args->items_size))
+			return -EINVAL;
+
+		for (i = 0; i < args->argc; ++i)
+			if (args->argv[i].type == item->type)
+				break;
+		if (i >= args->argc)
+			return -EINVAL;	/* type not accepted by command */
+
+		arg = &args->argv[i];
+
+		ret = kdbus_item_validate(item);
+		if (ret < 0)
+			return ret;
+
+		if (arg->item && !arg->multiple)
+			return -EINVAL;	/* repeated, but not @multiple */
+
+		arg->item = item;	/* a later match replaces an earlier */
+	}
+
+	if (!KDBUS_ITEMS_END(item, args->items, args->items_size))
+		return -EINVAL;	/* items must end at the aligned end */
+
+	return 0;
+}
+
+static int kdbus_args_negotiate(struct kdbus_args *args)
+{
+	struct kdbus_item __user *user;
+	struct kdbus_item *negotiation;
+	size_t i, j, num;
+
+	/*
+	 * If KDBUS_FLAG_NEGOTIATE is set, we overwrite the flags field with
+	 * the set of supported flags. Furthermore, if a KDBUS_ITEM_NEGOTIATE
+	 * item is passed, we iterate its payload (array of u64, each set to an
+	 * item type) and clear all unsupported item-types to 0.
+	 * The caller might do this recursively, if other flags or objects are
+	 * embedded in the payload itself.
+	 */
+
+	if (args->cmd->flags & KDBUS_FLAG_NEGOTIATE) {
+		if (put_user(args->allowed_flags & ~KDBUS_FLAG_NEGOTIATE,
+			     &args->user->flags))
+			return -EFAULT;
+	}
+
+	if (args->argc < 1 || args->argv[0].type != KDBUS_ITEM_NEGOTIATE ||
+	    !args->argv[0].item)
+		return 0;	/* no negotiate item at argv[0] */
+
+	negotiation = args->argv[0].item;
+	user = (struct kdbus_item __user *)
+		((u8 __user *)args->user +
+		 ((u8 *)negotiation - (u8 *)args->cmd));
+	num = KDBUS_ITEM_PAYLOAD_SIZE(negotiation) / sizeof(u64);
+
+	for (i = 0; i < num; ++i) {
+		for (j = 0; j < args->argc; ++j)
+			if (negotiation->data64[i] == args->argv[j].type)
+				break;
+
+		if (j < args->argc)
+			continue;
+
+		/* not supported: clear it in our copy and in user memory */
+		negotiation->data64[i] = 0;
+		if (put_user(negotiation->data64[i], &user->data64[i]))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+/**
+ * __kdbus_args_parse() - parse payload of kdbus command
+ * @args:		object to parse data into
+ * @is_cmd:		whether this is a command or msg payload
+ * @argp:		user-space location of command payload to parse
+ * @type_size:		minimum size of the payload (size of the fixed struct)
+ * @items_offset:	offset of items array in command payload
+ * @out:		output variable to store pointer to copied payload
+ *
+ * This parses the ioctl payload at user-space location @argp into @args. @args
+ * must be pre-initialized by the caller to reflect the supported flags and
+ * items of this command. This parser will then copy the command payload into
+ * kernel-space, verify correctness and consistency and cache pointers to parsed
+ * items and other data in @args.
+ *
+ * If this function succeeded, you must call kdbus_args_clear() to release
+ * allocated resources before destroying @args.
+ *
+ * This can also be used to import kdbus_msg objects. In that case, @is_cmd must
+ * be set to 'false' and the 'return_flags' field will not be touched (as it
+ * doesn't exist on kdbus_msg).
+ *
+ * Return: On failure a negative error code is returned. Otherwise, 1 is
+ * returned if negotiation was requested, 0 if not.
+ */
+int __kdbus_args_parse(struct kdbus_args *args, bool is_cmd, void __user *argp,
+		       size_t type_size, size_t items_offset, void **out)
+{
+	u64 user_size;
+	int ret, i;
+
+	ret = kdbus_copy_from_user(&user_size, argp, sizeof(user_size));
+	if (ret < 0)
+		return ret;
+
+	if (user_size < type_size)
+		return -EINVAL;
+	if (user_size > KDBUS_CMD_MAX_SIZE)
+		return -EMSGSIZE;
+
+	if (user_size <= sizeof(args->cmd_buf)) {
+		if (copy_from_user(args->cmd_buf, argp, user_size))
+			return -EFAULT;
+		args->cmd = (void*)args->cmd_buf;
+	} else {
+		args->cmd = memdup_user(argp, user_size);
+		if (IS_ERR(args->cmd))
+			return PTR_ERR(args->cmd);
+	}
+
+	if (args->cmd->size != user_size) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	if (is_cmd)
+		args->cmd->return_flags = 0;
+	args->user = argp;
+	args->items = (void *)((u8 *)args->cmd + items_offset);
+	args->items_size = args->cmd->size - items_offset;
+	args->is_cmd = is_cmd;
+
+	if (args->cmd->flags & ~args->allowed_flags) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	ret = kdbus_args_verify(args);
+	if (ret < 0)
+		goto error;
+
+	ret = kdbus_args_negotiate(args);
+	if (ret < 0)
+		goto error;
+
+	/* mandatory items must be given (but not on negotiation) */
+	if (!(args->cmd->flags & KDBUS_FLAG_NEGOTIATE)) {
+		for (i = 0; i < args->argc; ++i)
+			if (args->argv[i].mandatory && !args->argv[i].item) {
+				ret = -EINVAL;
+				goto error;
+			}
+	}
+
+	*out = args->cmd;
+	return !!(args->cmd->flags & KDBUS_FLAG_NEGOTIATE);
+
+error:
+	return kdbus_args_clear(args, ret);
+}
+
+/**
+ * kdbus_args_clear() - release allocated command resources
+ * @args:	object to release resources of, may be NULL
+ * @ret:	return value of this command
+ *
+ * This frees all allocated resources on @args and, unless @args describes a
+ * message payload, copies the result flags into user-space. @ret is usually
+ * returned unchanged, so it can end the command handler's 'return' statement.
+ *
+ * Return: -EFAULT if return values cannot be copied into user-space, otherwise
+ *         @ret is returned unchanged.
+ */
+int kdbus_args_clear(struct kdbus_args *args, int ret)
+{
+	if (!args)
+		return ret;
+
+	if (!IS_ERR_OR_NULL(args->cmd)) {
+		if (args->is_cmd && put_user(args->cmd->return_flags,
+					     &args->user->return_flags))
+			ret = -EFAULT;
+		if (args->cmd != (void*)args->cmd_buf)
+			kfree(args->cmd);	/* came from memdup_user() */
+		args->cmd = NULL;
+	}
+
+	return ret;
+}
+
+/**
+ * enum kdbus_handle_type - type a handle can be of
+ * @KDBUS_HANDLE_NONE:		no type set, yet
+ * @KDBUS_HANDLE_BUS_OWNER:	bus owner
+ * @KDBUS_HANDLE_EP_OWNER:	endpoint owner
+ * @KDBUS_HANDLE_CONNECTED:	endpoint connection after HELLO
+ */
+enum kdbus_handle_type {
+	KDBUS_HANDLE_NONE,
+	KDBUS_HANDLE_BUS_OWNER,
+	KDBUS_HANDLE_EP_OWNER,
+	KDBUS_HANDLE_CONNECTED,
+};
+
+/**
+ * struct kdbus_handle - handle to the kdbus system
+ * @lock:		serializes the setup ioctls that set @type
+ * @type:		type of this handle (KDBUS_HANDLE_*)
+ * @bus_owner:		bus this handle owns (KDBUS_HANDLE_BUS_OWNER)
+ * @ep_owner:		endpoint this handle owns (KDBUS_HANDLE_EP_OWNER)
+ * @conn:		connection this handle owns (KDBUS_HANDLE_CONNECTED)
+ * @privileged:		Flag to mark a handle as privileged
+ */
+struct kdbus_handle {
+	struct mutex lock;
+
+	enum kdbus_handle_type type;
+	union {
+		struct kdbus_bus *bus_owner;
+		struct kdbus_ep *ep_owner;
+		struct kdbus_conn *conn;
+	};
+
+	bool privileged:1;
+};
+
+static int kdbus_handle_open(struct inode *inode, struct file *file)
+{
+	struct kdbus_handle *handle;
+	struct kdbus_node *node;
+	int ret;
+
+	node = kdbus_node_from_inode(inode);
+	if (!kdbus_node_acquire(node))
+		return -ESHUTDOWN;
+
+	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+	if (!handle) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	mutex_init(&handle->lock);
+	handle->type = KDBUS_HANDLE_NONE;
+
+	if (node->type == KDBUS_NODE_ENDPOINT) {
+		struct kdbus_ep *ep = kdbus_ep_from_node(node);
+		struct kdbus_bus *bus = ep->bus;
+
+		/*
+		 * A connection is privileged if it is opened on an endpoint
+		 * without custom policy and either:
+		 *   * the user has CAP_IPC_OWNER in the domain user namespace
+		 * or
+		 *   * the caller's euid matches the uid of the bus creator
+		 */
+		if (!ep->user &&
+		    (ns_capable(bus->domain->user_namespace, CAP_IPC_OWNER) ||
+		     uid_eq(file->f_cred->euid, bus->node.uid)))
+			handle->privileged = true;
+	}
+
+	file->private_data = handle;
+	ret = 0;
+
+exit:
+	kdbus_node_release(node);
+	return ret;
+}
+
+static int kdbus_handle_release(struct inode *inode, struct file *file)
+{
+	struct kdbus_handle *handle = file->private_data;
+
+	switch (handle->type) {
+	case KDBUS_HANDLE_BUS_OWNER:
+		if (handle->bus_owner) {
+			kdbus_node_deactivate(&handle->bus_owner->node);
+			kdbus_bus_unref(handle->bus_owner);
+		}
+		break;
+	case KDBUS_HANDLE_EP_OWNER:
+		if (handle->ep_owner) {
+			kdbus_node_deactivate(&handle->ep_owner->node);
+			kdbus_ep_unref(handle->ep_owner);
+		}
+		break;
+	case KDBUS_HANDLE_CONNECTED:	/* conn is never NULL here */
+		kdbus_conn_disconnect(handle->conn, false);
+		kdbus_conn_unref(handle->conn);
+		break;
+	case KDBUS_HANDLE_NONE:
+		/* nothing to clean up */
+		break;
+	}
+
+	kfree(handle);
+
+	return 0;
+}
+
+static long kdbus_handle_ioctl_control(struct file *file, unsigned int cmd,
+				       void __user *argp)
+{
+	struct kdbus_handle *handle = file->private_data;
+	struct kdbus_node *node = kdbus_node_from_inode(file_inode(file));
+	struct kdbus_domain *domain;
+	int ret = 0;
+
+	if (!kdbus_node_acquire(node))
+		return -ESHUTDOWN;
+
+	/*
+	 * The parent of control-nodes is always a domain, make sure to pin it
+	 * so the parent is actually valid.
+	 */
+	domain = kdbus_domain_from_node(node->parent);
+	if (!kdbus_node_acquire(&domain->node)) {
+		kdbus_node_release(node);
+		return -ESHUTDOWN;
+	}
+
+	switch (cmd) {
+	case KDBUS_CMD_BUS_MAKE: {
+		struct kdbus_bus *bus;
+
+		bus = kdbus_cmd_bus_make(domain, argp);
+		if (IS_ERR_OR_NULL(bus)) {
+			ret = PTR_ERR_OR_ZERO(bus);
+			break;
+		}
+
+		handle->bus_owner = bus;
+		ret = KDBUS_HANDLE_BUS_OWNER;	/* becomes handle->type */
+		break;
+	}
+
+	default:
+		ret = -EBADFD;
+		break;
+	}
+
+	kdbus_node_release(&domain->node);
+	kdbus_node_release(node);
+	return ret;
+}
+
+static long kdbus_handle_ioctl_ep(struct file *file, unsigned int cmd,
+				  void __user *buf)
+{
+	struct kdbus_handle *handle = file->private_data;
+	struct kdbus_node *node = kdbus_node_from_inode(file_inode(file));
+	struct kdbus_ep *ep, *file_ep = kdbus_ep_from_node(node);
+	struct kdbus_conn *conn;
+	int ret = 0;
+
+	if (!kdbus_node_acquire(node))
+		return -ESHUTDOWN;
+
+	switch (cmd) {
+	case KDBUS_CMD_ENDPOINT_MAKE:
+		/* creating custom endpoints is a privileged operation */
+		if (!handle->privileged) {
+			ret = -EPERM;
+			break;
+		}
+
+		ep = kdbus_cmd_ep_make(file_ep->bus, buf);
+		if (IS_ERR_OR_NULL(ep)) {
+			ret = PTR_ERR_OR_ZERO(ep);
+			break;
+		}
+
+		handle->ep_owner = ep;
+		ret = KDBUS_HANDLE_EP_OWNER;	/* becomes handle->type */
+		break;
+
+	case KDBUS_CMD_HELLO:
+		conn = kdbus_cmd_hello(file_ep, handle->privileged, buf);
+		if (IS_ERR_OR_NULL(conn)) {
+			ret = PTR_ERR_OR_ZERO(conn);
+			break;
+		}
+
+		handle->conn = conn;
+		ret = KDBUS_HANDLE_CONNECTED;	/* becomes handle->type */
+		break;
+
+	default:
+		ret = -EBADFD;
+		break;
+	}
+
+	kdbus_node_release(node);
+	return ret;
+}
+
+static long kdbus_handle_ioctl_ep_owner(struct file *file, unsigned int command,
+					void __user *buf)
+{
+	struct kdbus_handle *handle = file->private_data;
+	struct kdbus_ep *ep = handle->ep_owner;
+	int ret;
+
+	if (!kdbus_node_acquire(&ep->node))
+		return -ESHUTDOWN;
+
+	switch (command) {
+	case KDBUS_CMD_ENDPOINT_UPDATE:
+		ret = kdbus_cmd_ep_update(ep, buf);
+		break;
+	default:	/* command not valid on an endpoint-owner handle */
+		ret = -EBADFD;
+		break;
+	}
+
+	kdbus_node_release(&ep->node);
+	return ret;
+}
+
+static long kdbus_handle_ioctl_connected(struct file *file,
+					 unsigned int command, void __user *buf)
+{
+	struct kdbus_handle *handle = file->private_data;
+	struct kdbus_conn *conn = handle->conn;
+	struct kdbus_conn *release_conn = conn;
+	int ret;
+
+	/* pin the connection; BYEBYE below drops the pin early */
+	ret = kdbus_conn_acquire(release_conn);
+	if (ret < 0)
+		return ret;
+
+	switch (command) {
+	case KDBUS_CMD_BYEBYE:
+		/*
+		 * BYEBYE is special; we must not hold our acquired reference
+		 * when calling into kdbus_conn_disconnect() or we will
+		 * deadlock, because kdbus_conn_disconnect() will wait for all
+		 * acquired references to be dropped.
+		 */
+		kdbus_conn_release(release_conn);
+		release_conn = NULL;
+		ret = kdbus_cmd_byebye_unlocked(conn, buf);
+		break;
+	case KDBUS_CMD_NAME_ACQUIRE:
+		ret = kdbus_cmd_name_acquire(conn, buf);
+		break;
+	case KDBUS_CMD_NAME_RELEASE:
+		ret = kdbus_cmd_name_release(conn, buf);
+		break;
+	case KDBUS_CMD_LIST:
+		ret = kdbus_cmd_list(conn, buf);
+		break;
+	case KDBUS_CMD_CONN_INFO:
+		ret = kdbus_cmd_conn_info(conn, buf);
+		break;
+	case KDBUS_CMD_BUS_CREATOR_INFO:
+		ret = kdbus_cmd_bus_creator_info(conn, buf);
+		break;
+	case KDBUS_CMD_UPDATE:
+		ret = kdbus_cmd_update(conn, buf);
+		break;
+	case KDBUS_CMD_MATCH_ADD:
+		ret = kdbus_cmd_match_add(conn, buf);
+		break;
+	case KDBUS_CMD_MATCH_REMOVE:
+		ret = kdbus_cmd_match_remove(conn, buf);
+		break;
+	case KDBUS_CMD_SEND:
+		ret = kdbus_cmd_send(conn, file, buf);
+		break;
+	case KDBUS_CMD_RECV:
+		ret = kdbus_cmd_recv(conn, buf);
+		break;
+	case KDBUS_CMD_FREE:
+		ret = kdbus_cmd_free(conn, buf);
+		break;
+	default:
+		ret = -EBADFD;
+		break;
+	}
+
+	kdbus_conn_release(release_conn);	/* NULL after BYEBYE */
+	return ret;
+}
+
+static long kdbus_handle_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg)
+{
+	struct kdbus_handle *handle = file->private_data;
+	struct kdbus_node *node = kdbus_node_from_inode(file_inode(file));
+	void __user *argp = (void __user *)arg;
+	long ret = -EBADFD;
+
+	switch (cmd) {
+	case KDBUS_CMD_BUS_MAKE:
+	case KDBUS_CMD_ENDPOINT_MAKE:
+	case KDBUS_CMD_HELLO:
+		mutex_lock(&handle->lock);
+		if (handle->type == KDBUS_HANDLE_NONE) {
+			if (node->type == KDBUS_NODE_CONTROL)
+				ret = kdbus_handle_ioctl_control(file, cmd,
+								 argp);
+			else if (node->type == KDBUS_NODE_ENDPOINT)
+				ret = kdbus_handle_ioctl_ep(file, cmd, argp);
+
+			if (ret > 0) {
+				/*
+				 * The data given via open() is not sufficient
+				 * to setup a kdbus handle. Hence, we require
+				 * the user to perform a setup ioctl. This setup
+				 * can only be performed once and defines the
+				 * type of the handle. The different setup
+				 * ioctls are locked against each other so they
+				 * cannot race. Once the handle type is set,
+				 * the type-dependent ioctls are enabled. To
+				 * improve performance, we don't lock those via
+				 * handle->lock. Instead, we issue a
+				 * write-barrier before performing the
+				 * type-change, which pairs with smp_rmb() in
+				 * all handlers that access the type field. This
+				 * guarantees the handle is fully setup, if
+				 * handle->type is set. If handle->type is
+				 * unset, you must not make any assumptions
+				 * without taking handle->lock.
+				 * Note that handle->type is only set once. It
+				 * will never change afterwards.
+				 */
+				smp_wmb();
+				handle->type = ret;
+			}
+		}
+		mutex_unlock(&handle->lock);
+		break;
+
+	case KDBUS_CMD_ENDPOINT_UPDATE:
+	case KDBUS_CMD_BYEBYE:
+	case KDBUS_CMD_NAME_ACQUIRE:
+	case KDBUS_CMD_NAME_RELEASE:
+	case KDBUS_CMD_LIST:
+	case KDBUS_CMD_CONN_INFO:
+	case KDBUS_CMD_BUS_CREATOR_INFO:
+	case KDBUS_CMD_UPDATE:
+	case KDBUS_CMD_MATCH_ADD:
+	case KDBUS_CMD_MATCH_REMOVE:
+	case KDBUS_CMD_SEND:
+	case KDBUS_CMD_RECV:
+	case KDBUS_CMD_FREE: {
+		enum kdbus_handle_type type;
+
+		/*
+		 * This read-barrier pairs with smp_wmb() of the handle setup.
+		 * It guarantees the handle is fully written, in case the
+		 * type has been set. It allows us to access the handle without
+		 * taking handle->lock, given the guarantee that the type is
+		 * only ever set once, and stays constant afterwards.
+		 * Furthermore, the handle object itself is not modified in any
+		 * way after the type is set. That is, the type-field is the
+		 * last field that is written on any handle. If it has not been
+		 * set, we must not access the handle here.
+		 */
+		type = handle->type;
+		smp_rmb();
+
+		if (type == KDBUS_HANDLE_EP_OWNER)
+			ret = kdbus_handle_ioctl_ep_owner(file, cmd, argp);
+		else if (type == KDBUS_HANDLE_CONNECTED)
+			ret = kdbus_handle_ioctl_connected(file, cmd, argp);
+
+		break;
+	}
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	return ret < 0 ? ret : 0;	/* positive results stay internal */
+}
+
+static unsigned int kdbus_handle_poll(struct file *file,
+				      struct poll_table_struct *wait)
+{
+	struct kdbus_handle *handle = file->private_data;
+	enum kdbus_handle_type type;
+	unsigned int mask = POLLOUT | POLLWRNORM;
+
+	/*
+	 * This pairs with smp_wmb() during handle setup. It guarantees that
+	 * _iff_ the handle type is set, handle->conn is valid. Furthermore,
+	 * _iff_ the type is set, the handle object is constant and never
+	 * changed again. If it's not set, we must not access the handle but
+	 * bail out. We also must assume no setup has taken place, yet.
+	 */
+	type = handle->type;
+	smp_rmb();
+
+	/* Only a connected endpoint can read/write data */
+	if (type != KDBUS_HANDLE_CONNECTED)
+		return POLLERR | POLLHUP;
+
+	poll_wait(file, &handle->conn->wait, wait);
+
+	/*
+	 * Verify the connection hasn't been deactivated _after_ adding the
+	 * wait-queue. This guarantees that, if the connection is deactivated
+	 * after we checked it, the waitqueue is signaled and we're called
+	 * again.
+	 */
+	if (!kdbus_conn_active(handle->conn))
+		return POLLERR | POLLHUP;
+
+	if (!list_empty(&handle->conn->queue.msg_list) ||
+	    atomic_read(&handle->conn->lost_count) > 0)
+		mask |= POLLIN | POLLRDNORM;	/* something to receive */
+
+	return mask;
+}
+
+static int kdbus_handle_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct kdbus_handle *handle = file->private_data;
+	enum kdbus_handle_type type;
+
+	/*
+	 * Read the type before anything else; the read-barrier below pairs
+	 * with the smp_wmb() issued during handle setup. _Iff_ the type is
+	 * set, handle->conn is valid and the handle object is constant and
+	 * never changed again. If it's not set, we must not access the
+	 * handle but bail out, and we must assume that no setup has taken
+	 * place, yet.
+	 */
+	type = handle->type;
+	smp_rmb();
+
+	/* Without a connection there is no pool we could map */
+	if (type != KDBUS_HANDLE_CONNECTED)
+		return -EBADFD;
+
+	return kdbus_pool_mmap(handle->conn->pool, vma);
+}
+
+const struct file_operations kdbus_handle_ops = {
+	.owner =		THIS_MODULE,
+	.open =			kdbus_handle_open,
+	.release =		kdbus_handle_release,
+	.poll =			kdbus_handle_poll,
+	.llseek =		noop_llseek,
+	.unlocked_ioctl =	kdbus_handle_ioctl,
+	.mmap =			kdbus_handle_mmap,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl =		kdbus_handle_ioctl,
+#endif
+};
diff --git a/ipc/kdbus/handle.h b/ipc/kdbus/handle.h
new file mode 100644
index 000000000..8a36c0595
--- /dev/null
+++ b/ipc/kdbus/handle.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_HANDLE_H
+#define __KDBUS_HANDLE_H
+
+#include <linux/fs.h>
+#include <uapi/linux/kdbus.h>
+
+extern const struct file_operations kdbus_handle_ops;
+
+/**
+ * kdbus_arg - information and state of a single ioctl command item
+ * @type:		item type
+ * @item:		set by the parser to the last found item of this type
+ * @multiple:		whether multiple items of this type are allowed
+ * @mandatory:		whether at least one item of this type is required
+ *
+ * This structure describes a single item in an ioctl command payload. The
+ * caller has to pre-fill the type and flags, the parser will then use this
+ * information to verify the ioctl payload. @item is set by the parser to point
+ * to the last occurrence of the item.
+ */
+struct kdbus_arg {
+	u64 type;
+	struct kdbus_item *item;
+	bool multiple : 1;
+	bool mandatory : 1;
+};
+
+/**
+ * kdbus_args - information and state of ioctl command parser
+ * @allowed_flags:	set of flags this command supports
+ * @argc:		number of items in @argv
+ * @argv:		array of items this command supports
+ * @user:		set by parser to user-space location of current command
+ * @cmd:		set by parser to kernel copy (in @cmd_buf or on the heap)
+ * @cmd_buf:		512 bytes inline buf to avoid kmalloc() on small cmds
+ * @items:		points to item array in @cmd
+ * @items_size:		size of @items in bytes
+ * @is_cmd:		whether this is a command-payload or msg-payload
+ *
+ * This structure is used to parse ioctl command payloads on each invocation.
+ * The ioctl handler has to pre-fill the flags and allowed items before passing
+ * the object to kdbus_args_parse(). The parser will copy the command payload
+ * into kernel-space and verify the correctness of the data.
+ *
+ * We use a 512-byte buffer for small command payloads, to be allocated on
+ * stack on syscall entrance.
+ */
+struct kdbus_args {
+	u64 allowed_flags;
+	size_t argc;
+	struct kdbus_arg *argv;
+
+	struct kdbus_cmd __user *user;
+	struct kdbus_cmd *cmd;
+	u8 cmd_buf[512];
+
+	struct kdbus_item *items;
+	size_t items_size;
+	bool is_cmd : 1;
+};
+
+int __kdbus_args_parse(struct kdbus_args *args, bool is_cmd, void __user *argp,
+		       size_t type_size, size_t items_offset, void **out);
+int kdbus_args_clear(struct kdbus_args *args, int ret);
+
+#define kdbus_args_parse(_args, _argp, _v)                              \
+	({                                                              \
+		BUILD_BUG_ON(offsetof(typeof(**(_v)), size) !=          \
+			     offsetof(struct kdbus_cmd, size));         \
+		BUILD_BUG_ON(offsetof(typeof(**(_v)), flags) !=         \
+			     offsetof(struct kdbus_cmd, flags));        \
+		BUILD_BUG_ON(offsetof(typeof(**(_v)), return_flags) !=  \
+			     offsetof(struct kdbus_cmd, return_flags)); \
+		__kdbus_args_parse((_args), 1, (_argp), sizeof(**(_v)), \
+				   offsetof(typeof(**(_v)), items),     \
+				   (void **)(_v));                      \
+	})
+
+#define kdbus_args_parse_msg(_args, _argp, _v)                          \
+	({                                                              \
+		BUILD_BUG_ON(offsetof(typeof(**(_v)), size) !=          \
+			     offsetof(struct kdbus_cmd, size));         \
+		BUILD_BUG_ON(offsetof(typeof(**(_v)), flags) !=         \
+			     offsetof(struct kdbus_cmd, flags));        \
+		__kdbus_args_parse((_args), 0, (_argp), sizeof(**(_v)), \
+				   offsetof(typeof(**(_v)), items),     \
+				   (void **)(_v));                      \
+	})
+
+#endif
diff --git a/ipc/kdbus/item.c b/ipc/kdbus/item.c
new file mode 100644
index 000000000..ce78dba03
--- /dev/null
+++ b/ipc/kdbus/item.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+
+#include "item.h"
+#include "limits.h"
+#include "util.h"
+
+/*
+ * This verifies that @size is non-zero and that the @size bytes at @str are
+ * zero-terminated, with no 0-byte anywhere but in the very last position.
+ */
+static bool kdbus_str_valid(const char *str, size_t size)
+{
+	return size > 0 && memchr(str, '\0', size) == str + size - 1;
+}
+
+/**
+ * kdbus_item_validate_name() - validate an item containing a name
+ * @item:		Item to validate
+ *
+ * Return: zero on success or a negative error code on failure
+ */
+int kdbus_item_validate_name(const struct kdbus_item *item)
+{
+	const char *name = item->str;
+	unsigned int i;
+	size_t len;
+
+	if (item->size < KDBUS_ITEM_HEADER_SIZE + 2)	/* 1 char + NUL */
+		return -EINVAL;
+
+	if (item->size > KDBUS_ITEM_HEADER_SIZE +
+			 KDBUS_SYSNAME_MAX_LEN + 1)
+		return -ENAMETOOLONG;
+
+	if (!kdbus_str_valid(name, KDBUS_ITEM_PAYLOAD_SIZE(item)))
+		return -EINVAL;
+
+	len = strlen(name);
+	if (len == 0)
+		return -EINVAL;
+
+	for (i = 0; i < len; i++) {
+		if (isalpha(name[i]))
+			continue;
+		if (isdigit(name[i]))
+			continue;
+		if (name[i] == '_')
+			continue;
+		if (i > 0 && i + 1 < len && (name[i] == '-' || name[i] == '.'))
+			continue;	/* never first or last character */
+
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * kdbus_item_validate() - validate a single item
+ * @item:	item to validate
+ *
+ * Return: 0 if item is valid, negative error code if not.
+ */
+int kdbus_item_validate(const struct kdbus_item *item)
+{
+	size_t payload_size = KDBUS_ITEM_PAYLOAD_SIZE(item);
+	size_t l;
+	int ret;
+
+	BUILD_BUG_ON(KDBUS_ITEM_HEADER_SIZE !=
+		     sizeof(struct kdbus_item_header));
+
+	if (item->size < KDBUS_ITEM_HEADER_SIZE)
+		return -EINVAL;
+
+	switch (item->type) {
+	case KDBUS_ITEM_NEGOTIATE:
+		if (payload_size % sizeof(u64) != 0)
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_PAYLOAD_VEC:
+	case KDBUS_ITEM_PAYLOAD_OFF:
+		if (payload_size != sizeof(struct kdbus_vec))
+			return -EINVAL;
+		if (item->vec.size == 0 || item->vec.size > SIZE_MAX)
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_PAYLOAD_MEMFD:
+		if (payload_size != sizeof(struct kdbus_memfd))
+			return -EINVAL;
+		if (item->memfd.size == 0 || item->memfd.size > SIZE_MAX)
+			return -EINVAL;
+		if (item->memfd.fd < 0)
+			return -EBADF;
+		break;
+
+	case KDBUS_ITEM_FDS:
+		if (payload_size % sizeof(int) != 0)
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_CANCEL_FD:
+		if (payload_size != sizeof(int))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_BLOOM_PARAMETER:
+		if (payload_size != sizeof(struct kdbus_bloom_parameter))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_BLOOM_FILTER:
+		/* followed by the bloom-mask, depends on the bloom-size */
+		if (payload_size < sizeof(struct kdbus_bloom_filter))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_BLOOM_MASK:
+		/* size depends on bloom-size of bus */
+		break;
+
+	case KDBUS_ITEM_CONN_DESCRIPTION:
+	case KDBUS_ITEM_MAKE_NAME:
+		ret = kdbus_item_validate_name(item);
+		if (ret < 0)
+			return ret;
+		break;
+
+	case KDBUS_ITEM_ATTACH_FLAGS_SEND:
+	case KDBUS_ITEM_ATTACH_FLAGS_RECV:
+	case KDBUS_ITEM_ID:
+	case KDBUS_ITEM_DST_ID:
+		if (payload_size != sizeof(u64))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_TIMESTAMP:
+		if (payload_size != sizeof(struct kdbus_timestamp))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_CREDS:
+		if (payload_size != sizeof(struct kdbus_creds))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_AUXGROUPS:
+		if (payload_size % sizeof(u32) != 0)
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_NAME:
+	case KDBUS_ITEM_DST_NAME:
+	case KDBUS_ITEM_PID_COMM:
+	case KDBUS_ITEM_TID_COMM:
+	case KDBUS_ITEM_EXE:
+	case KDBUS_ITEM_CMDLINE:
+	case KDBUS_ITEM_CGROUP:
+	case KDBUS_ITEM_SECLABEL:
+		if (!kdbus_str_valid(item->str, payload_size))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_CAPS:
+		if (payload_size < sizeof(u32))
+			return -EINVAL;
+		if (payload_size < sizeof(u32) +
+		    4 * CAP_TO_INDEX(item->caps.last_cap) * sizeof(u32))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_AUDIT:
+		if (payload_size != sizeof(struct kdbus_audit))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_POLICY_ACCESS:
+		if (payload_size != sizeof(struct kdbus_policy_access))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_NAME_ADD:
+	case KDBUS_ITEM_NAME_REMOVE:
+	case KDBUS_ITEM_NAME_CHANGE:
+		if (payload_size < sizeof(struct kdbus_notify_name_change))
+			return -EINVAL;
+		l = payload_size - offsetof(struct kdbus_notify_name_change,
+					    name);
+		if (l > 0 && !kdbus_str_valid(item->name_change.name, l))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_ID_ADD:
+	case KDBUS_ITEM_ID_REMOVE:
+		if (payload_size != sizeof(struct kdbus_notify_id_change))
+			return -EINVAL;
+		break;
+
+	case KDBUS_ITEM_REPLY_TIMEOUT:
+	case KDBUS_ITEM_REPLY_DEAD:
+		if (payload_size != 0)
+			return -EINVAL;
+		break;
+
+	default:	/* unlisted types: header-size check only */
+		break;
+	}
+
+	return 0;
+}
+
+/**
+ * kdbus_items_validate() - validate items passed by user-space
+ * @items:		items to validate
+ * @items_size:		size of @items in bytes
+ *
+ * This verifies that the passed items pointer is consistent and valid.
+ * Furthermore, each item is checked for:
+ *  - valid "size" value
+ *  - payload is of expected type
+ *  - payload is fully included in the item
+ *  - string payloads are zero-terminated
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_items_validate(const struct kdbus_item *items, size_t items_size)
+{
+	const struct kdbus_item *item;
+	int ret;
+
+	KDBUS_ITEMS_FOREACH(item, items, items_size) {
+		if (!KDBUS_ITEM_VALID(item, items, items_size))
+			return -EINVAL;
+
+		ret = kdbus_item_validate(item);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (!KDBUS_ITEMS_END(item, items, items_size))
+		return -EINVAL;	/* items must end at the aligned end */
+
+	return 0;
+}
+
+/**
+ * kdbus_item_set() - Set item content
+ * @item:	The item to modify
+ * @type:	The item type to set (KDBUS_ITEM_*)
+ * @data:	Data to copy to item->data, may be %NULL
+ * @len:	Number of bytes in @data
+ *
+ * This sets type, size and data fields of an item. If @data is NULL, the data
+ * memory is cleared.
+ *
+ * Note that you must align your @data memory to 8 bytes. Trailing padding (in
+ * case @len is not 8byte aligned) is cleared by this call.
+ *
+ * Return: Pointer to the following item.
+ */
+struct kdbus_item *kdbus_item_set(struct kdbus_item *item, u64 type,
+				  const void *data, size_t len)
+{
+	item->type = type;
+	item->size = KDBUS_ITEM_HEADER_SIZE + len;	/* unpadded size */
+
+	if (data) {
+		memcpy(item->data, data, len);
+		memset(item->data + len, 0, KDBUS_ALIGN8(len) - len);
+	} else {
+		memset(item->data, 0, KDBUS_ALIGN8(len));
+	}
+
+	return KDBUS_ITEM_NEXT(item);
+}
diff --git a/ipc/kdbus/item.h b/ipc/kdbus/item.h
new file mode 100644
index 000000000..3a7e6ccc2
--- /dev/null
+++ b/ipc/kdbus/item.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_ITEM_H
+#define __KDBUS_ITEM_H
+
+#include <linux/kernel.h>
+#include <uapi/linux/kdbus.h>
+
+#include "util.h"
+
+/* generic access and iterators over a stream of 8-byte aligned items */
+#define KDBUS_ITEM_NEXT(_i) (typeof(_i))((u8 *)(_i) + KDBUS_ALIGN8((_i)->size))
+#define KDBUS_ITEMS_SIZE(_h, _is) ((_h)->size - offsetof(typeof(*(_h)), _is))
+#define KDBUS_ITEM_HEADER_SIZE offsetof(struct kdbus_item, data)
+#define KDBUS_ITEM_SIZE(_s) KDBUS_ALIGN8(KDBUS_ITEM_HEADER_SIZE + (_s))
+#define KDBUS_ITEM_PAYLOAD_SIZE(_i) ((_i)->size - KDBUS_ITEM_HEADER_SIZE)
+/* walk while the cursor lies in [_is, _is + _s); item sizes are not checked */
+#define KDBUS_ITEMS_FOREACH(_i, _is, _s)				\
+	for ((_i) = (_is);						\
+	     ((u8 *)(_i) < (u8 *)(_is) + (_s)) &&			\
+	       ((u8 *)(_i) >= (u8 *)(_is));				\
+	     (_i) = KDBUS_ITEM_NEXT(_i))
+/* item is at least header-sized, does not wrap and lies inside the stream */
+#define KDBUS_ITEM_VALID(_i, _is, _s)					\
+	((_i)->size >= KDBUS_ITEM_HEADER_SIZE &&			\
+	 (u8 *)(_i) + (_i)->size > (u8 *)(_i) &&			\
+	 (u8 *)(_i) + (_i)->size <= (u8 *)(_is) + (_s) &&		\
+	 (u8 *)(_i) >= (u8 *)(_is))
+/* true if the cursor is exactly at the 8-byte aligned end of the stream */
+#define KDBUS_ITEMS_END(_i, _is, _s)					\
+	((u8 *)(_i) == ((u8 *)(_is) + KDBUS_ALIGN8(_s)))
+
+/**
+ * struct kdbus_item_header - Describes the fixed part of an item
+ * @size:	The total size of the item
+ * @type:	The item type, one of KDBUS_ITEM_*
+ */
+struct kdbus_item_header {
+	u64 size;
+	u64 type;
+};
+
+int kdbus_item_validate_name(const struct kdbus_item *item);
+int kdbus_item_validate(const struct kdbus_item *item);
+int kdbus_items_validate(const struct kdbus_item *items, size_t items_size);
+struct kdbus_item *kdbus_item_set(struct kdbus_item *item, u64 type,
+				  const void *data, size_t len);
+
+#endif
diff --git a/ipc/kdbus/limits.h b/ipc/kdbus/limits.h
new file mode 100644
index 000000000..c54925a25
--- /dev/null
+++ b/ipc/kdbus/limits.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_DEFAULTS_H
+#define __KDBUS_DEFAULTS_H
+
+#include <linux/kernel.h>
+
+/* maximum size of message header and items */
+#define KDBUS_MSG_MAX_SIZE		SZ_8K
+
+/* maximum number of memfd items per message */
+#define KDBUS_MSG_MAX_MEMFD_ITEMS	16
+
+/* max size of ioctl command data */
+#define KDBUS_CMD_MAX_SIZE		SZ_32K
+
+/* maximum number of inflight fds in a target queue per user */
+#define KDBUS_CONN_MAX_FDS_PER_USER	16
+
+/* maximum message payload size */
+#define KDBUS_MSG_MAX_PAYLOAD_VEC_SIZE		SZ_2M
+
+/* maximum size of bloom bit field in bytes */
+#define KDBUS_BUS_BLOOM_MAX_SIZE		SZ_4K
+
+/* maximum length of well-known bus name */
+#define KDBUS_NAME_MAX_LEN			255
+
+/* maximum length of bus, domain, ep name */
+#define KDBUS_SYSNAME_MAX_LEN			63
+
+/* maximum number of match entries in the match database of a connection */
+#define KDBUS_MATCH_MAX				256
+
+/* maximum number of queued messages from the same individual user */
+#define KDBUS_CONN_MAX_MSGS			256
+
+/* maximum number of well-known names per connection */
+#define KDBUS_CONN_MAX_NAMES			256
+
+/* maximum number of queued requests waiting for a reply */
+#define KDBUS_CONN_MAX_REQUESTS_PENDING		128
+
+/* maximum number of connections per user in one domain */
+#define KDBUS_USER_MAX_CONN			1024
+
+/* maximum number of buses per user in one domain */
+#define KDBUS_USER_MAX_BUSES			16
+
+#endif /* __KDBUS_DEFAULTS_H */
diff --git a/ipc/kdbus/main.c b/ipc/kdbus/main.c
new file mode 100644
index 000000000..1ad4dc8da
--- /dev/null
+++ b/ipc/kdbus/main.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#define pr_fmt(fmt)    KBUILD_MODNAME ": " fmt
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include "util.h"
+#include "fs.h"
+#include "handle.h"
+#include "metadata.h"
+#include "node.h"
+
+/*
+ * This is a simplified outline of the internal kdbus object relations, for
+ * those interested in the inner life of the driver implementation.
+ *
+ * From a mount point's (domain's) perspective:
+ *
+ * struct kdbus_domain
+ *   |» struct kdbus_user *user (many, owned)
+ *   '» struct kdbus_node node (embedded)
+ *       |» struct kdbus_node children (many, referenced)
+ *       |» struct kdbus_node *parent (pinned)
+ *       '» struct kdbus_bus (many, pinned)
+ *           |» struct kdbus_node node (embedded)
+ *           '» struct kdbus_ep (many, pinned)
+ *               |» struct kdbus_node node (embedded)
+ *               |» struct kdbus_bus *bus (pinned)
+ *               |» struct kdbus_conn conn_list (many, pinned)
+ *               |   |» struct kdbus_ep *ep (pinned)
+ *               |   |» struct kdbus_name_entry *activator_of (owned)
+ *               |   |» struct kdbus_match_db *match_db (owned)
+ *               |   |» struct kdbus_meta *meta (owned)
+ *               |   |» struct kdbus_match_db *match_db (owned)
+ *               |   |    '» struct kdbus_match_entry (many, owned)
+ *               |   |
+ *               |   |» struct kdbus_pool *pool (owned)
+ *               |   |    '» struct kdbus_pool_slice *slices (many, owned)
+ *               |   |       '» struct kdbus_pool *pool (pinned)
+ *               |   |
+ *               |   |» struct kdbus_user *user (pinned)
+ *               |   `» struct kdbus_queue_entry entries (many, embedded)
+ *               |        |» struct kdbus_pool_slice *slice (pinned)
+ *               |        |» struct kdbus_conn_reply *reply (owned)
+ *               |        '» struct kdbus_user *user (pinned)
+ *               |
+ *               '» struct kdbus_user *user (pinned)
+ *                   '» struct kdbus_policy_db policy_db (embedded)
+ *                        |» struct kdbus_policy_db_entry (many, owned)
+ *                        |   |» struct kdbus_conn (pinned)
+ *                        |   '» struct kdbus_ep (pinned)
+ *                        |
+ *                        '» struct kdbus_policy_db_cache_entry (many, owned)
+ *                            '» struct kdbus_conn (pinned)
+ *
+ * For the life-time of a file descriptor derived from calling open() on a file
+ * inside the mount point:
+ *
+ * struct kdbus_handle
+ *  |» struct kdbus_meta *meta (owned)
+ *  |» struct kdbus_ep *ep (pinned)
+ *  |» struct kdbus_conn *conn (owned)
+ *  '» struct kdbus_ep *ep (owned)
+ */
+
+/* kdbus mount-point /sys/fs/kdbus */
+static struct kobject *kdbus_dir;
+
+/* module init: create the kdbus kobject below fs_kobj, call kdbus_fs_init() */
+static int __init kdbus_init(void)
+{
+	int err;
+
+	kdbus_dir = kobject_create_and_add(KBUILD_MODNAME, fs_kobj);
+	if (!kdbus_dir)
+		return -ENOMEM;
+
+	err = kdbus_fs_init();
+	if (err >= 0) {
+		pr_info("initialized\n");
+		return 0;
+	}
+
+	pr_err("cannot register filesystem: %d\n", err);
+
+	/* drop the kobject that was created above again */
+	kobject_put(kdbus_dir);
+	return err;
+}
+/* module exit: undo kdbus_init() in reverse order, then destroy the node ida */
+static void __exit kdbus_exit(void)
+{
+	kdbus_fs_exit();
+	kobject_put(kdbus_dir);
+	ida_destroy(&kdbus_node_ida);
+}
+
+module_init(kdbus_init);
+module_exit(kdbus_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("D-Bus, powerful, easy to use interprocess communication");
+MODULE_ALIAS_FS(KBUILD_MODNAME "fs");
diff --git a/ipc/kdbus/match.c b/ipc/kdbus/match.c
new file mode 100644
index 000000000..4ee6a1f2e
--- /dev/null
+++ b/ipc/kdbus/match.c
@@ -0,0 +1,546 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "match.h"
+#include "message.h"
+#include "names.h"
+
+/**
+ * struct kdbus_match_db - message filters
+ * @entries_list:	List of match entries (struct kdbus_match_entry)
+ * @mdb_rwlock:		Protects @entries_list and @entries_count
+ * @entries_count:	Number of entries in database
+ */
+struct kdbus_match_db {
+	struct list_head entries_list;
+	struct rw_semaphore mdb_rwlock;
+	unsigned int entries_count;
+};
+
+/**
+ * struct kdbus_match_entry - a match database entry
+ * @cookie:		User-supplied cookie to lookup the entry
+ * @list_entry:		Link in the entries_list of the match database
+ * @rules_list:		Rules of this entry (struct kdbus_match_rule)
+ */
+struct kdbus_match_entry {
+	u64 cookie;
+	struct list_head list_entry;
+	struct list_head rules_list;
+};
+
+/**
+ * struct kdbus_bloom_mask - mask to match against filter
+ * @generations:	Number of bit fields (generations) carried in @data
+ * @data:		Bloom bit fields of all generations, back to back
+ */
+struct kdbus_bloom_mask {
+	u64 generations;
+	u64 *data;
+};
+
+/**
+ * struct kdbus_match_rule - a rule appended to a match entry
+ * @type:		An item type to match against
+ * @bloom_mask:		Bloom mask to match a message's filter against, used
+ *			with KDBUS_ITEM_BLOOM_MASK
+ * @name:		Name to match against, used with KDBUS_ITEM_NAME,
+ *			KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE}
+ * @old_id:		ID to match against, used with
+ *			KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE},
+ *			KDBUS_ITEM_ID_REMOVE
+ * @new_id:		ID to match against, used with
+ *			KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE},
+ *			KDBUS_ITEM_ID_ADD
+ * @src_id:		ID to match against, used with KDBUS_ITEM_ID
+ * @dst_id:		Message destination ID, used with KDBUS_ITEM_DST_ID
+ * @rules_entry:	Entry in the entry's rules list
+ */
+struct kdbus_match_rule {
+	u64 type;
+	union {
+		struct kdbus_bloom_mask bloom_mask;
+		struct {
+			char *name;
+			u64 old_id;
+			u64 new_id;
+		};
+		u64 src_id;
+		u64 dst_id;
+	};
+	struct list_head rules_entry;
+};
+
+/* release @rule, its type-specific payload and its link in the rules list */
+static void kdbus_match_rule_free(struct kdbus_match_rule *rule)
+{
+	if (!rule)
+		return;
+
+	switch (rule->type) {
+	case KDBUS_ITEM_ID:
+	case KDBUS_ITEM_DST_ID:
+	case KDBUS_ITEM_ID_ADD:
+	case KDBUS_ITEM_ID_REMOVE:
+		/* plain IDs only, nothing was allocated */
+		break;
+	case KDBUS_ITEM_NAME:
+	case KDBUS_ITEM_NAME_ADD:
+	case KDBUS_ITEM_NAME_REMOVE:
+	case KDBUS_ITEM_NAME_CHANGE:
+		kfree(rule->name);
+		break;
+	case KDBUS_ITEM_BLOOM_MASK:
+		kfree(rule->bloom_mask.data);
+		break;
+	default:
+		/* any other rule type is treated as a fatal bug */
+		BUG();
+	}
+
+	list_del(&rule->rules_entry);
+	kfree(rule);
+}
+
+static void kdbus_match_entry_free(struct kdbus_match_entry *entry)
+{
+	if (!entry)
+		return;
+
+	/* kdbus_match_rule_free() unlinks the rule, so this drains the list */
+	while (!list_empty(&entry->rules_list))
+		kdbus_match_rule_free(list_first_entry(&entry->rules_list,
+					struct kdbus_match_rule, rules_entry));
+
+	list_del(&entry->list_entry);
+	kfree(entry);
+}
+
+/**
+ * kdbus_match_db_free() - release a match database and all its entries
+ * @mdb:		The match database, may be %NULL
+ */
+void kdbus_match_db_free(struct kdbus_match_db *mdb)
+{
+	struct kdbus_match_entry *cur, *next;
+
+	if (mdb) {
+		list_for_each_entry_safe(cur, next, &mdb->entries_list,
+					 list_entry)
+			kdbus_match_entry_free(cur);
+
+		kfree(mdb);
+	}
+}
+
+/**
+ * kdbus_match_db_new() - allocate an empty match database
+ *
+ * Return: a new kdbus_match_db on success, ERR_PTR on failure.
+ */
+struct kdbus_match_db *kdbus_match_db_new(void)
+{
+	struct kdbus_match_db *mdb = kzalloc(sizeof(*mdb), GFP_KERNEL);
+
+	if (!mdb)
+		return ERR_PTR(-ENOMEM);
+
+	/* entries_count is already 0, the object was zero-allocated */
+	INIT_LIST_HEAD(&mdb->entries_list);
+	init_rwsem(&mdb->mdb_rwlock);
+
+	return mdb;
+}
+
+static bool kdbus_match_bloom(const struct kdbus_bloom_filter *filter,
+			      const struct kdbus_bloom_mask *mask,
+			      const struct kdbus_conn *conn)
+{
+	const size_t n_words = conn->ep->bus->bloom.size / sizeof(u64);
+	const u64 *want;
+	size_t w;
+
+	/*
+	 * A mask may carry several generations of its bit field, each of
+	 * them n_words long. Pick the one at the index of the filter's
+	 * generation, or the last one carried if the filter's generation
+	 * is beyond that.
+	 */
+	want = mask->data +
+	       min(filter->generation, mask->generations - 1) * n_words;
+
+	/*
+	 * The filter describes the properties of the message, the mask the
+	 * properties the connection has subscribed to. The message can only
+	 * be of interest if every bit set in the mask is also set in the
+	 * filter.
+	 */
+	for (w = 0; w < n_words; w++)
+		if (want[w] & ~filter->data[w])
+			return false;
+
+	return true;
+}
+
+/* check rule @r against a message @s that was sent by connection @c */
+static bool kdbus_match_rule_conn(const struct kdbus_match_rule *r,
+				  struct kdbus_conn *c,
+				  const struct kdbus_staging *s)
+{
+	lockdep_assert_held(&c->ep->bus->name_registry->rwlock);
+
+	if (r->type == KDBUS_ITEM_BLOOM_MASK)
+		return kdbus_match_bloom(s->bloom_filter, &r->bloom_mask, c);
+	if (r->type == KDBUS_ITEM_ID)
+		return r->src_id == c->id || r->src_id == KDBUS_MATCH_ID_ANY;
+	if (r->type == KDBUS_ITEM_DST_ID)
+		return r->dst_id == s->msg->dst_id ||
+		       r->dst_id == KDBUS_MATCH_ID_ANY;
+	if (r->type == KDBUS_ITEM_NAME)
+		return kdbus_conn_has_name(c, r->name);
+
+	/* all other rule types never match here */
+	return false;
+}
+/* check rule @r against the notification item carried by @s */
+static bool kdbus_match_rule_kernel(const struct kdbus_match_rule *r,
+				    const struct kdbus_staging *s)
+{
+	const struct kdbus_item *note = s->notify;
+	u64 id;
+
+	if (WARN_ON(!note) || note->type != r->type)
+		return false;
+	switch (r->type) {
+	case KDBUS_ITEM_ID_ADD:
+	case KDBUS_ITEM_ID_REMOVE:
+		id = r->type == KDBUS_ITEM_ID_ADD ? r->new_id : r->old_id;
+		return id == KDBUS_MATCH_ID_ANY || id == note->id_change.id;
+	case KDBUS_ITEM_NAME_ADD:
+	case KDBUS_ITEM_NAME_CHANGE:
+	case KDBUS_ITEM_NAME_REMOVE:
+		if (r->old_id != KDBUS_MATCH_ID_ANY &&
+		    r->old_id != note->name_change.old_id.id)
+			return false;
+		if (r->new_id != KDBUS_MATCH_ID_ANY &&
+		    r->new_id != note->name_change.new_id.id)
+			return false;
+		return !r->name || !strcmp(r->name, note->name_change.name);
+	default:
+		return false;
+	}
+}
+/* true if all rules of @entry match; a NULL @c selects the kernel rules */
+static bool kdbus_match_rules(const struct kdbus_match_entry *entry,
+			      struct kdbus_conn *c,
+			      const struct kdbus_staging *s)
+{
+	struct kdbus_match_rule *rule;
+
+	list_for_each_entry(rule, &entry->rules_list, rules_entry)
+		if (!(c ? kdbus_match_rule_conn(rule, c, s) :
+			  kdbus_match_rule_kernel(rule, s)))
+			return false;
+
+	return true;
+}
+
+/**
+ * kdbus_match_db_match_msg() - match a message against the database entries
+ * @mdb:		The match database
+ * @conn_src:		The connection object originating the message
+ * @staging:		Staging object containing the message to match against
+ *
+ * This function walks all database entries previously added with
+ * kdbus_cmd_match_add(). As soon as one of them has all of its rules
+ * satisfied, the walk stops and true is returned.
+ *
+ * The caller must hold the registry lock of conn_src->ep->bus, in case conn_src
+ * is non-NULL.
+ *
+ * Return: true if there was a matching database entry, false otherwise.
+ */
+bool kdbus_match_db_match_msg(struct kdbus_match_db *mdb,
+			      struct kdbus_conn *conn_src,
+			      const struct kdbus_staging *staging)
+{
+	struct kdbus_match_entry *cur;
+	bool hit = false;
+
+	down_read(&mdb->mdb_rwlock);
+	list_for_each_entry(cur, &mdb->entries_list, list_entry)
+		if (kdbus_match_rules(cur, conn_src, staging)) {
+			hit = true;
+			break;
+		}
+	up_read(&mdb->mdb_rwlock);
+
+	return hit;
+}
+/* drop all entries with @cookie; callers hold mdb_rwlock for writing */
+static int kdbus_match_db_remove_unlocked(struct kdbus_match_db *mdb,
+					  u64 cookie)
+{
+	struct kdbus_match_entry *cur, *next;
+	int ret = -EBADSLT;
+
+	list_for_each_entry_safe(cur, next, &mdb->entries_list, list_entry)
+		if (cur->cookie == cookie) {
+			kdbus_match_entry_free(cur);
+			mdb->entries_count--;
+			ret = 0;
+		}
+
+	return ret;
+}
+
+/**
+ * kdbus_cmd_match_add() - handle KDBUS_CMD_MATCH_ADD
+ * @conn:		connection to operate on
+ * @argp:		command payload
+ *
+ * One call to this function (or one ioctl(KDBUS_CMD_MATCH_ADD), respectively)
+ * adds one new database entry with n rules attached to it. Each rule is
+ * described with a kdbus_item, and an entry is considered matching if all
+ * its rules are satisfied.
+ *
+ * The items attached to a kdbus_cmd_match struct have the following mapping:
+ *
+ * KDBUS_ITEM_BLOOM_MASK:	A bloom mask
+ * KDBUS_ITEM_NAME:		A connection's source name
+ * KDBUS_ITEM_ID:		ID of the sending connection
+ * KDBUS_ITEM_DST_ID:		Destination ID of the message
+ * KDBUS_ITEM_NAME_ADD:
+ * KDBUS_ITEM_NAME_REMOVE:
+ * KDBUS_ITEM_NAME_CHANGE:	Well-known name changes, carry
+ *				kdbus_notify_name_change
+ * KDBUS_ITEM_ID_ADD:
+ * KDBUS_ITEM_ID_REMOVE:	Connection ID changes, carry
+ *				kdbus_notify_id_change
+ *
+ * For kdbus_notify_{id,name}_change structs, only the ID and name fields
+ * are looked at when adding an entry. The flags are unused.
+ *
+ * Also note that KDBUS_ITEM_BLOOM_MASK, KDBUS_ITEM_NAME, KDBUS_ITEM_ID,
+ * and KDBUS_ITEM_DST_ID are used to match messages from userspace, while the
+ * others apply to kernel-generated notifications.
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_match_add(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_match_db *mdb = conn->match_db;
+	struct kdbus_match_entry *entry = NULL;
+	struct kdbus_cmd_match *cmd;
+	struct kdbus_item *item;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_BLOOM_MASK, .multiple = true },
+		{ .type = KDBUS_ITEM_NAME, .multiple = true },
+		{ .type = KDBUS_ITEM_ID, .multiple = true },
+		{ .type = KDBUS_ITEM_DST_ID, .multiple = true },
+		{ .type = KDBUS_ITEM_NAME_ADD, .multiple = true },
+		{ .type = KDBUS_ITEM_NAME_REMOVE, .multiple = true },
+		{ .type = KDBUS_ITEM_NAME_CHANGE, .multiple = true },
+		{ .type = KDBUS_ITEM_ID_ADD, .multiple = true },
+		{ .type = KDBUS_ITEM_ID_REMOVE, .multiple = true },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_MATCH_REPLACE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	if (!kdbus_conn_is_ordinary(conn))
+		return -EOPNOTSUPP;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	entry->cookie = cmd->cookie;
+	INIT_LIST_HEAD(&entry->list_entry);
+	INIT_LIST_HEAD(&entry->rules_list);
+
+	KDBUS_ITEMS_FOREACH(item, cmd->items, KDBUS_ITEMS_SIZE(cmd, items)) {
+		struct kdbus_match_rule *rule;
+		size_t size = item->size - offsetof(struct kdbus_item, data);
+
+		rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+		if (!rule) {
+			ret = -ENOMEM;
+			goto exit;
+		}
+
+		rule->type = item->type;
+		INIT_LIST_HEAD(&rule->rules_entry);
+		/* NOTE(review): no case for KDBUS_ITEM_NEGOTIATE - intended? */
+		switch (item->type) {
+		case KDBUS_ITEM_BLOOM_MASK: {
+			u64 bsize = conn->ep->bus->bloom.size;
+			u64 generations;
+			u64 remainder;
+
+			generations = div64_u64_rem(size, bsize, &remainder);
+			if (size < bsize || remainder > 0) {
+				ret = -EDOM;
+				break;
+			}
+
+			rule->bloom_mask.data = kmemdup(item->data,
+							size, GFP_KERNEL);
+			if (!rule->bloom_mask.data) {
+				ret = -ENOMEM;
+				break;
+			}
+
+			rule->bloom_mask.generations = generations;
+			break;
+		}
+
+		case KDBUS_ITEM_NAME:
+			if (!kdbus_name_is_valid(item->str, false)) {
+				ret = -EINVAL;
+				break;
+			}
+
+			rule->name = kstrdup(item->str, GFP_KERNEL);
+			if (!rule->name)
+				ret = -ENOMEM;
+
+			break;
+
+		case KDBUS_ITEM_ID:
+			rule->src_id = item->id;
+			break;
+
+		case KDBUS_ITEM_DST_ID:
+			rule->dst_id = item->id;
+			break;
+
+		case KDBUS_ITEM_NAME_ADD:
+		case KDBUS_ITEM_NAME_REMOVE:
+		case KDBUS_ITEM_NAME_CHANGE:
+			rule->old_id = item->name_change.old_id.id;
+			rule->new_id = item->name_change.new_id.id;
+
+			if (size > sizeof(struct kdbus_notify_name_change)) {
+				rule->name = kstrdup(item->name_change.name,
+						     GFP_KERNEL);
+				if (!rule->name)
+					ret = -ENOMEM;
+			}
+
+			break;
+
+		case KDBUS_ITEM_ID_ADD:
+		case KDBUS_ITEM_ID_REMOVE:
+			if (item->type == KDBUS_ITEM_ID_ADD)
+				rule->new_id = item->id_change.id;
+			else
+				rule->old_id = item->id_change.id;
+
+			break;
+		}
+
+		if (ret < 0) {
+			kdbus_match_rule_free(rule);
+			goto exit;
+		}
+
+		list_add_tail(&rule->rules_entry, &entry->rules_list);
+	}
+
+	down_write(&mdb->mdb_rwlock);
+
+	/* Remove any entry that has the same cookie as the current one. */
+	if (cmd->flags & KDBUS_MATCH_REPLACE)
+		kdbus_match_db_remove_unlocked(mdb, entry->cookie);
+
+	/*
+	 * If the above removal caught any entry, there will be room for the
+	 * new one.
+	 */
+	if (++mdb->entries_count > KDBUS_MATCH_MAX) {
+		--mdb->entries_count;
+		ret = -EMFILE;
+	} else {
+		list_add_tail(&entry->list_entry, &mdb->entries_list);
+		entry = NULL;
+	}
+
+	up_write(&mdb->mdb_rwlock);
+
+exit:
+	kdbus_match_entry_free(entry);
+	return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_match_remove() - handle KDBUS_CMD_MATCH_REMOVE
+ * @conn:		connection whose match database is modified
+ * @argp:		user-space pointer to the struct kdbus_cmd_match payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_match_remove(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+	struct kdbus_match_db *mdb = conn->match_db;
+	struct kdbus_cmd_match *cmd;
+	int ret;
+
+	if (!kdbus_conn_is_ordinary(conn))
+		return -EOPNOTSUPP;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret)
+		return ret;
+
+	down_write(&mdb->mdb_rwlock);
+	ret = kdbus_match_db_remove_unlocked(mdb, cmd->cookie);
+	up_write(&mdb->mdb_rwlock);
+
+	return kdbus_args_clear(&args, ret);
+}
diff --git a/ipc/kdbus/match.h b/ipc/kdbus/match.h
new file mode 100644
index 000000000..ceb492f8e
--- /dev/null
+++ b/ipc/kdbus/match.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_MATCH_H
+#define __KDBUS_MATCH_H
+
+struct kdbus_conn;
+struct kdbus_match_db;
+struct kdbus_staging;
+
+struct kdbus_match_db *kdbus_match_db_new(void);
+void kdbus_match_db_free(struct kdbus_match_db *db);
+int kdbus_match_db_add(struct kdbus_conn *conn,
+		       struct kdbus_cmd_match *cmd);
+int kdbus_match_db_remove(struct kdbus_conn *conn,
+			  struct kdbus_cmd_match *cmd);
+bool kdbus_match_db_match_msg(struct kdbus_match_db *db,
+			      struct kdbus_conn *conn_src,
+			      const struct kdbus_staging *staging);
+
+int kdbus_cmd_match_add(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_match_remove(struct kdbus_conn *conn, void __user *argp);
+
+#endif
diff --git a/ipc/kdbus/message.c b/ipc/kdbus/message.c
new file mode 100644
index 000000000..432dba4dc
--- /dev/null
+++ b/ipc/kdbus/message.c
@@ -0,0 +1,1040 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/cgroup.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/shmem_fs.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <net/sock.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "match.h"
+#include "message.h"
+#include "names.h"
+#include "policy.h"
+
+static const char * const zeros = "\0\0\0\0\0\0\0";
+
+static struct kdbus_gaps *kdbus_gaps_new(size_t n_memfds, size_t n_fds)
+{
+	size_t sz_off, sz_memfd, sz_fd;
+	struct kdbus_gaps *gaps;
+
+	sz_off = n_memfds * sizeof(*gaps->memfd_offsets);
+	sz_memfd = n_memfds * sizeof(*gaps->memfd_files);
+	sz_fd = n_fds * sizeof(*gaps->fd_files);
+
+	/* one allocation holds the object directly followed by its arrays */
+	gaps = kzalloc(sizeof(*gaps) + sz_off + sz_memfd + sz_fd, GFP_KERNEL);
+	if (!gaps)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&gaps->kref);
+	gaps->n_fds = 0; /* room for n_fds is reserved, but not enforced */
+	gaps->n_memfds = 0; /* same for n_memfds */
+	gaps->memfd_offsets = (void *)(gaps + 1);
+	gaps->memfd_files = (void *)((u8 *)(gaps + 1) + sz_off);
+	gaps->fd_files = (void *)((u8 *)(gaps + 1) + sz_off + sz_memfd);
+
+	return gaps;
+}
+/* kref release function: drop all stored file references, free the object */
+static void kdbus_gaps_free(struct kref *kref)
+{
+	struct kdbus_gaps *g = container_of(kref, struct kdbus_gaps, kref);
+	struct file **f;
+
+	for (f = g->fd_files; f < g->fd_files + g->n_fds; ++f)
+		if (*f)
+			fput(*f);
+	for (f = g->memfd_files; f < g->memfd_files + g->n_memfds; ++f)
+		if (*f)
+			fput(*f);
+
+	kfree(g);
+}
+
+/**
+ * kdbus_gaps_ref() - gain reference
+ * @gaps:	gaps object, may be %NULL
+ *
+ * Return: @gaps is returned
+ */
+struct kdbus_gaps *kdbus_gaps_ref(struct kdbus_gaps *gaps)
+{
+	if (gaps)
+		kref_get(&gaps->kref);
+	return gaps;
+}
+
+/**
+ * kdbus_gaps_unref() - drop reference, free the object with the last one
+ * @gaps:	gaps object, may be %NULL
+ *
+ * Return: NULL
+ */
+struct kdbus_gaps *kdbus_gaps_unref(struct kdbus_gaps *gaps)
+{
+	if (gaps)
+		kref_put(&gaps->kref, kdbus_gaps_free);
+	return NULL;
+}
+
+/**
+ * kdbus_gaps_install() - install file-descriptors
+ * @gaps:		gaps object, or NULL
+ * @slice:		pool slice that contains the message
+ * @out_incomplete:	output variable to note incomplete fds
+ *
+ * This function installs all file-descriptors of @gaps into the current
+ * process and copies the file-descriptor numbers into the target pool slice.
+ *
+ * If a file-descriptor number cannot be reserved, that file is skipped and
+ * @out_incomplete is set to true. Otherwise, it's set to false.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int kdbus_gaps_install(struct kdbus_gaps *gaps, struct kdbus_pool_slice *slice,
+		       bool *out_incomplete)
+{
+	bool incomplete_fds = false;
+	struct kvec kvec;
+	size_t i, n_fds;
+	int ret, *fds;
+
+	/* make sure the output is defined on every return path */
+	*out_incomplete = false;
+
+	n_fds = gaps ? gaps->n_fds + gaps->n_memfds : 0;
+	if (n_fds < 1)
+		return 0; /* nothing to do */
+
+	fds = kmalloc_array(n_fds, sizeof(*fds), GFP_TEMPORARY);
+	n_fds = 0;
+	if (!fds)
+		return -ENOMEM;
+
+	/* 1) allocate fds and copy them over */
+
+	if (gaps->n_fds > 0) {
+		for (i = 0; i < gaps->n_fds; ++i) {
+			int fd;
+
+			fd = get_unused_fd_flags(O_CLOEXEC);
+			if (fd < 0)
+				incomplete_fds = true;
+
+			WARN_ON(!gaps->fd_files[i]);
+
+			fds[n_fds++] = fd < 0 ? -1 : fd;
+		}
+
+		/*
+		 * The file-descriptor array can only be present once per
+		 * message. Hence, prepare all fds and then copy them over with
+		 * a single kvec.
+		 */
+
+		WARN_ON(!gaps->fd_offset);
+
+		kvec.iov_base = fds;
+		kvec.iov_len = gaps->n_fds * sizeof(*fds);
+		ret = kdbus_pool_slice_copy_kvec(slice, gaps->fd_offset,
+						 &kvec, 1, kvec.iov_len);
+		if (ret < 0)
+			goto exit;
+	}
+
+	for (i = 0; i < gaps->n_memfds; ++i) {
+		int memfd;
+
+		memfd = get_unused_fd_flags(O_CLOEXEC);
+
+		/*
+		 * Always consume a slot, even on failure: step 2 looks up the
+		 * fd of memfd_files[i] at fds[gaps->n_fds + i].
+		 */
+		fds[n_fds++] = memfd < 0 ? -1 : memfd;
+
+		if (memfd < 0) {
+			incomplete_fds = true;
+			/* memfds are initialized to -1, skip copying it */
+			continue;
+		}
+
+		/*
+		 * memfds have to be copied individually as they each are put
+		 * into a separate item. This should not be an issue, though,
+		 * as usually there is no need to send more than one memfd per
+		 * message.
+		 */
+
+		WARN_ON(!gaps->memfd_offsets[i]);
+		WARN_ON(!gaps->memfd_files[i]);
+
+		kvec.iov_base = &memfd;
+		kvec.iov_len = sizeof(memfd);
+		ret = kdbus_pool_slice_copy_kvec(slice, gaps->memfd_offsets[i],
+						 &kvec, 1, kvec.iov_len);
+		if (ret < 0)
+			goto exit;
+	}
+
+	/* 2) install fds now that everything was successful */
+
+	for (i = 0; i < gaps->n_fds; ++i)
+		if (fds[i] >= 0)
+			fd_install(fds[i], get_file(gaps->fd_files[i]));
+	for (i = 0; i < gaps->n_memfds; ++i)
+		if (fds[gaps->n_fds + i] >= 0)
+			fd_install(fds[gaps->n_fds + i],
+				   get_file(gaps->memfd_files[i]));
+
+	ret = 0;
+
+exit:
+	if (ret < 0)
+		for (i = 0; i < n_fds; ++i)
+			if (fds[i] >= 0) /* -1 marks a failed reservation */
+				put_unused_fd(fds[i]);
+	kfree(fds);
+	*out_incomplete = incomplete_fds;
+	return ret;
+}
+
+static struct file *kdbus_get_fd(int fd)
+{
+	struct socket *sock = NULL;
+	struct inode *inode;
+	struct file *f;
+
+	if (fd < 0)
+		return ERR_PTR(-EBADF);
+
+	f = fget_raw(fd);
+	if (!f)
+		return ERR_PTR(-EBADF);
+
+	/* O_PATH is always allowed */
+	if (f->f_mode & FMODE_PATH)
+		return f;
+
+	inode = file_inode(f);
+	if (S_ISSOCK(inode->i_mode))
+		sock = SOCKET_I(inode);
+
+	/* disallow kdbus-fds and UDS over kdbus, all other are allowed */
+	if (f->f_op == &kdbus_handle_ops ||
+	    (sock && sock->sk && sock->ops && sock->ops->family == PF_UNIX)) {
+		fput(f);
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	return f;
+}
+
+static struct file *kdbus_get_memfd(const struct kdbus_memfd *memfd)
+{
+	const int m = F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL;
+	struct file *f, *ret;
+	u64 isize;
+	int s;
+
+	f = memfd->fd >= 0 ? fget(memfd->fd) : NULL;
+	if (!f)
+		return ERR_PTR(-EBADF);
+
+	s = shmem_get_seals(f);
+	isize = (u64)i_size_read(file_inode(f));
+
+	if (s < 0)
+		ret = ERR_PTR(-EMEDIUMTYPE);
+	else if ((s & m) != m)
+		ret = ERR_PTR(-ETXTBSY);
+	else if (memfd->size > isize || memfd->start > isize - memfd->size)
+		ret = ERR_PTR(-EFAULT); /* also if start + size would wrap */
+	else
+		ret = f;
+
+	if (f != ret)
+		fput(f);
+
+	return ret;
+}
+
+/*
+ * kdbus_msg_examine() - validate a message and gather sizing statistics
+ * @msg:		message to validate
+ * @bus:		bus the message is sent on (provides the bloom size)
+ * @cmd:		send command accompanying @msg
+ * @out_n_memfds:	number of PAYLOAD_MEMFD items found
+ * @out_n_fds:		number of file descriptors in the FDS item
+ * @out_n_parts:	number of iovecs needed for vec data and padding
+ *
+ * The output parameters are only written if validation succeeds.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int kdbus_msg_examine(struct kdbus_msg *msg, struct kdbus_bus *bus,
+			     struct kdbus_cmd_send *cmd, size_t *out_n_memfds,
+			     size_t *out_n_fds, size_t *out_n_parts)
+{
+	struct kdbus_item *item, *fds = NULL, *bloom = NULL, *dstname = NULL;
+	u64 n_parts, n_memfds, n_fds, vec_size;
+
+	/*
+	 * Step 1:
+	 * Validate the message and command parameters.
+	 */
+
+	/* KDBUS_PAYLOAD_KERNEL is reserved to kernel messages */
+	if (msg->payload_type == KDBUS_PAYLOAD_KERNEL)
+		return -EINVAL;
+
+	if (msg->dst_id == KDBUS_DST_ID_BROADCAST) {
+		/* broadcasts must be marked as signals */
+		if (!(msg->flags & KDBUS_MSG_SIGNAL))
+			return -EBADMSG;
+		/* broadcasts cannot have timeouts */
+		if (msg->timeout_ns > 0)
+			return -ENOTUNIQ;
+	}
+
+	if (msg->flags & KDBUS_MSG_EXPECT_REPLY) {
+		/* if you expect a reply, you must specify a timeout */
+		if (msg->timeout_ns == 0)
+			return -EINVAL;
+		/* signals cannot have replies */
+		if (msg->flags & KDBUS_MSG_SIGNAL)
+			return -ENOTUNIQ;
+	} else {
+		/* must expect reply if sent as synchronous call */
+		if (cmd->flags & KDBUS_SEND_SYNC_REPLY)
+			return -EINVAL;
+		/* cannot mark replies as signal */
+		if (msg->cookie_reply && (msg->flags & KDBUS_MSG_SIGNAL))
+			return -EINVAL;
+	}
+
+	/*
+	 * Step 2:
+	 * Validate all passed items. While at it, select some statistics that
+	 * are required to allocate state objects later on.
+	 *
+	 * Generic item validation has already been done via
+	 * kdbus_item_validate(). Furthermore, the number of items is naturally
+	 * limited by the maximum message size. Hence, only non-generic item
+	 * checks are performed here (mainly integer overflow tests).
+	 */
+
+	n_parts = 0;
+	n_memfds = 0;
+	n_fds = 0;
+	vec_size = 0;
+
+	KDBUS_ITEMS_FOREACH(item, msg->items, KDBUS_ITEMS_SIZE(msg, items)) {
+		switch (item->type) {
+		case KDBUS_ITEM_PAYLOAD_VEC: {
+			void __force __user *ptr = KDBUS_PTR(item->vec.address);
+			u64 size = item->vec.size;
+
+			if (vec_size + size < vec_size)
+				return -EMSGSIZE;
+			if (vec_size + size > KDBUS_MSG_MAX_PAYLOAD_VEC_SIZE)
+				return -EMSGSIZE;
+			if (ptr && unlikely(!access_ok(VERIFY_READ, ptr, size)))
+				return -EFAULT;
+
+			/*
+			 * Track the running total so the two size checks
+			 * above limit the sum of all vecs in the message,
+			 * not just each vec on its own.
+			 */
+			vec_size += size;
+
+			if (ptr || size % 8) /* data or padding */
+				++n_parts;
+			break;
+		}
+		case KDBUS_ITEM_PAYLOAD_MEMFD: {
+			u64 start = item->memfd.start;
+			u64 size = item->memfd.size;
+
+			if (start + size < start)
+				return -EMSGSIZE;
+			if (n_memfds >= KDBUS_MSG_MAX_MEMFD_ITEMS)
+				return -E2BIG;
+
+			++n_memfds;
+			if (size % 8) /* vec-padding required */
+				++n_parts;
+			break;
+		}
+		case KDBUS_ITEM_FDS: {
+			if (fds)
+				return -EEXIST;
+
+			fds = item;
+			n_fds = KDBUS_ITEM_PAYLOAD_SIZE(item) / sizeof(int);
+			if (n_fds > KDBUS_CONN_MAX_FDS_PER_USER)
+				return -EMFILE;
+
+			break;
+		}
+		case KDBUS_ITEM_BLOOM_FILTER: {
+			u64 bloom_size;
+
+			if (bloom)
+				return -EEXIST;
+
+			bloom = item;
+			bloom_size = KDBUS_ITEM_PAYLOAD_SIZE(item) -
+				     offsetof(struct kdbus_bloom_filter, data);
+			if (!KDBUS_IS_ALIGNED8(bloom_size))
+				return -EFAULT;
+			if (bloom_size != bus->bloom.size)
+				return -EDOM;
+
+			break;
+		}
+		case KDBUS_ITEM_DST_NAME: {
+			if (dstname)
+				return -EEXIST;
+
+			dstname = item;
+			if (!kdbus_name_is_valid(item->str, false))
+				return -EINVAL;
+			if (msg->dst_id == KDBUS_DST_ID_BROADCAST)
+				return -EBADMSG;
+
+			break;
+		}
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Step 3:
+	 * Validate that required items were actually passed, and that no item
+	 * contradicts the message flags.
+	 */
+
+	/* bloom filters must be attached _iff_ it's a signal */
+	if (!(msg->flags & KDBUS_MSG_SIGNAL) != !bloom)
+		return -EBADMSG;
+	/* destination name is required if no ID is given */
+	if (msg->dst_id == KDBUS_DST_ID_NAME && !dstname)
+		return -EDESTADDRREQ;
+	/* cannot send file-descriptors attached to broadcasts */
+	if (msg->dst_id == KDBUS_DST_ID_BROADCAST && fds)
+		return -ENOTUNIQ;
+
+	*out_n_memfds = n_memfds;
+	*out_n_fds = n_fds;
+	*out_n_parts = n_parts;
+
+	return 0;
+}
+
+/*
+ * Add the PAYLOAD_VEC item @merge to the staging iovecs, merging it with the
+ * previous payload item if possible.
+ *
+ * @prev_item and @prev_vec carry the previously handled payload item and the
+ * iovec created for it (or NULL); both are updated on return. Every iovec
+ * added here is taken from staging->parts and accounted in
+ * staging->n_payload.
+ *
+ * Returns true if @merge was folded into the previous item, so the caller
+ * can drop it. The merge branch is compiled out by the constant 0 in its
+ * condition, so currently a new iovec is appended for data (non-NULL
+ * address) or for zero padding (size not a multiple of 8), and no iovec is
+ * added otherwise.
+ */
+static bool kdbus_staging_merge_vecs(struct kdbus_staging *staging,
+				     struct kdbus_item **prev_item,
+				     struct iovec **prev_vec,
+				     const struct kdbus_item *merge)
+{
+	void __user *ptr = (void __user *)KDBUS_PTR(merge->vec.address);
+	u64 padding = merge->vec.size % 8;
+	struct kdbus_item *prev = *prev_item;
+	struct iovec *vec = *prev_vec;
+
+	/* XXX: merging is disabled so far */
+	if (0 && prev && prev->type == KDBUS_ITEM_PAYLOAD_OFF &&
+	    !merge->vec.address == !prev->vec.address) {
+		/*
+		 * If we merge two VECs, we can always drop the second
+		 * PAYLOAD_VEC item. Hence, include its size in the previous
+		 * one.
+		 */
+		prev->vec.size += merge->vec.size;
+
+		if (ptr) {
+			/*
+			 * If we merge two data VECs, we need two iovecs to copy
+			 * the data. But the items can be easily merged by
+			 * summing their lengths.
+			 */
+			vec = &staging->parts[staging->n_parts++];
+			vec->iov_len = merge->vec.size;
+			vec->iov_base = ptr;
+			staging->n_payload += vec->iov_len;
+		} else if (padding) {
+			/*
+			 * If we merge two 0-vecs with the second 0-vec
+			 * requiring padding, we need to insert an iovec to copy
+			 * the 0-padding. We try merging it with the previous
+			 * 0-padding iovec. This might end up with an
+			 * iov_len==0, in which case we simply drop the iovec.
+			 */
+			if (vec) {
+				staging->n_payload -= vec->iov_len;
+				vec->iov_len = prev->vec.size % 8;
+				if (!vec->iov_len) {
+					--staging->n_parts;
+					vec = NULL;
+				} else {
+					staging->n_payload += vec->iov_len;
+				}
+			} else {
+				vec = &staging->parts[staging->n_parts++];
+				vec->iov_len = padding;
+				vec->iov_base = (char __user *)zeros;
+				staging->n_payload += vec->iov_len;
+			}
+		} else {
+			/*
+			 * If we merge two 0-vecs with the second 0-vec having
+			 * no padding, we know the padding of the first stays
+			 * the same. Hence, @vec needs no adjustment.
+			 */
+		}
+
+		/* successfully merged with previous item */
+		merge = prev;
+	} else {
+		/*
+		 * If we cannot merge the payload item with the previous one,
+		 * we simply insert a new iovec for the data/padding.
+		 */
+		if (ptr) {
+			vec = &staging->parts[staging->n_parts++];
+			vec->iov_len = merge->vec.size;
+			vec->iov_base = ptr;
+			staging->n_payload += vec->iov_len;
+		} else if (padding) {
+			vec = &staging->parts[staging->n_parts++];
+			vec->iov_len = padding;
+			vec->iov_base = (char __user *)zeros;
+			staging->n_payload += vec->iov_len;
+		} else {
+			vec = NULL;
+		}
+	}
+
+	/* remember what we handled last, for the next call */
+	*prev_item = (struct kdbus_item *)merge;
+	*prev_vec = vec;
+
+	/* true only if the merge branch replaced @merge by @prev */
+	return merge == prev;
+}
+
+/*
+ * Import all items of staging->msg into the staging area.
+ *
+ * The item array is walked once and compacted in place: items that were
+ * merged into a previous one are overwritten by their successors. Along the
+ * way:
+ *   - PAYLOAD_VEC items are turned into PAYLOAD_OFF items and get an iovec
+ *     in staging->parts (see kdbus_staging_merge_vecs()),
+ *   - PAYLOAD_MEMFD items get their file pinned and their position recorded
+ *     in staging->gaps,
+ *   - the files of an FDS item are pinned and recorded in staging->gaps,
+ *   - pointers to the bloom-filter and destination-name items are cached.
+ * Finally msg->size is recomputed to end at the last retained item.
+ *
+ * Files already stored in staging->gaps are not released here on failure.
+ *
+ * Returns 0 on success, or the negative error code reported by
+ * kdbus_get_memfd() or kdbus_get_fd().
+ */
+static int kdbus_staging_import(struct kdbus_staging *staging)
+{
+	struct kdbus_item *it, *item, *last, *prev_payload;
+	struct kdbus_gaps *gaps = staging->gaps;
+	struct kdbus_msg *msg = staging->msg;
+	struct iovec *part, *prev_part;
+	bool drop_item;
+
+	drop_item = false;
+	last = NULL;
+	prev_payload = NULL;
+	prev_part = NULL;
+
+	/*
+	 * We modify msg->items along the way; make sure to use @item as offset
+	 * to the next item (instead of the iterator @it).
+	 */
+	for (it = item = msg->items;
+	     it >= msg->items &&
+	             (u8 *)it < (u8 *)msg + msg->size &&
+	             (u8 *)it + it->size <= (u8 *)msg + msg->size; ) {
+		/*
+		 * If we dropped items along the way, move current item to
+		 * front. We must not access @it afterwards, but use @item
+		 * instead!
+		 */
+		if (it != item)
+			memmove(item, it, it->size);
+		/* advance the read cursor; @item now holds the current item */
+		it = (void *)((u8 *)it + KDBUS_ALIGN8(item->size));
+
+		switch (item->type) {
+		case KDBUS_ITEM_PAYLOAD_VEC: {
+			/* payload offset of this vec, before it is added */
+			size_t offset = staging->n_payload;
+
+			if (kdbus_staging_merge_vecs(staging, &prev_payload,
+						     &prev_part, item)) {
+				drop_item = true;
+			} else if (item->vec.address) {
+				/* real offset is patched later on */
+				item->type = KDBUS_ITEM_PAYLOAD_OFF;
+				item->vec.offset = offset;
+			} else {
+				/* no data attached: offset is set to all-ones */
+				item->type = KDBUS_ITEM_PAYLOAD_OFF;
+				item->vec.offset = ~0ULL;
+			}
+
+			break;
+		}
+		case KDBUS_ITEM_PAYLOAD_MEMFD: {
+			struct file *f;
+
+			f = kdbus_get_memfd(&item->memfd);
+			if (IS_ERR(f))
+				return PTR_ERR(f);
+
+			/* record file and offset of the 'fd' field in @msg */
+			gaps->memfd_files[gaps->n_memfds] = f;
+			gaps->memfd_offsets[gaps->n_memfds] =
+					(u8 *)&item->memfd.fd - (u8 *)msg;
+			++gaps->n_memfds;
+
+			/* memfds cannot be merged */
+			prev_payload = item;
+			prev_part = NULL;
+
+			/* insert padding to make following VECs aligned */
+			if (item->memfd.size % 8) {
+				part = &staging->parts[staging->n_parts++];
+				part->iov_len = item->memfd.size % 8;
+				part->iov_base = (char __user *)zeros;
+				staging->n_payload += part->iov_len;
+			}
+
+			break;
+		}
+		case KDBUS_ITEM_FDS: {
+			size_t i, n_fds;
+
+			n_fds = KDBUS_ITEM_PAYLOAD_SIZE(item) / sizeof(int);
+			for (i = 0; i < n_fds; ++i) {
+				struct file *f;
+
+				f = kdbus_get_fd(item->fds[i]);
+				if (IS_ERR(f))
+					return PTR_ERR(f);
+
+				gaps->fd_files[gaps->n_fds++] = f;
+			}
+
+			/* offset of the fd array relative to start of @msg */
+			gaps->fd_offset = (u8 *)item->fds - (u8 *)msg;
+
+			break;
+		}
+		case KDBUS_ITEM_BLOOM_FILTER:
+			staging->bloom_filter = &item->bloom_filter;
+			break;
+		case KDBUS_ITEM_DST_NAME:
+			staging->dst_name = item->str;
+			break;
+		}
+
+		/* drop item if we merged it with a previous one */
+		if (drop_item) {
+			drop_item = false;
+		} else {
+			last = item;
+			item = KDBUS_ITEM_NEXT(item);
+		}
+	}
+
+	/* adjust message size regarding dropped items */
+	msg->size = offsetof(struct kdbus_msg, items);
+	if (last)
+		msg->size += ((u8 *)last - (u8 *)msg->items) + last->size;
+
+	return 0;
+}
+
+/*
+ * Claim the next free iovec in @staging and initialize it as an empty
+ * placeholder (length 0, base pointing at the zeros buffer).
+ */
+static void kdbus_staging_reserve(struct kdbus_staging *staging)
+{
+	struct iovec *slot = staging->parts + staging->n_parts;
+
+	staging->n_parts++;
+	slot->iov_len = 0;
+	slot->iov_base = (void __user *)zeros;
+}
+
+/*
+ * Allocate a staging object with room for @n_parts caller iovecs plus the
+ * iovecs reserved for the message header, and optionally @msg_extra_size
+ * bytes for an inline message placed right behind the iovec array.
+ *
+ * Returns the new object, or ERR_PTR on failure.
+ */
+static struct kdbus_staging *kdbus_staging_new(struct kdbus_bus *bus,
+					       size_t n_parts,
+					       size_t msg_extra_size)
+{
+	const size_t reserved_parts = 5; /* see below for explanation */
+	struct kdbus_meta_proc *meta_proc;
+	struct kdbus_meta_conn *meta_conn;
+	struct kdbus_staging *staging;
+	size_t parts_size, i;
+	int ret;
+
+	n_parts += reserved_parts;
+	parts_size = n_parts * sizeof(*staging->parts);
+
+	staging = kzalloc(sizeof(*staging) + parts_size + msg_extra_size,
+			  GFP_TEMPORARY);
+	if (!staging)
+		return ERR_PTR(-ENOMEM);
+
+	staging->msg_seqnum = atomic64_inc_return(&bus->domain->last_id);
+	staging->n_parts = 0; /* we reserve n_parts, but don't enforce them */
+	staging->parts = (void *)(staging + 1);
+
+	/* if requested, the message lives right behind the iovec array */
+	if (msg_extra_size)
+		staging->msg = (void *)((u8 *)staging->parts + parts_size);
+
+	/* the object is zeroed, so failed metadata slots simply stay NULL */
+	meta_proc = kdbus_meta_proc_new();
+	if (IS_ERR(meta_proc)) {
+		ret = PTR_ERR(meta_proc);
+		goto error;
+	}
+	staging->meta_proc = meta_proc;
+
+	meta_conn = kdbus_meta_conn_new();
+	if (IS_ERR(meta_conn)) {
+		ret = PTR_ERR(meta_conn);
+		goto error;
+	}
+	staging->meta_conn = meta_conn;
+
+	/*
+	 * Prepare iovecs to copy the message into the target pool. We use the
+	 * following iovecs, in this order:
+	 *   * iovec to copy "kdbus_msg.size"
+	 *   * iovec to copy "struct kdbus_msg" (minus size) plus items
+	 *   * iovec for possible padding after the items
+	 *   * iovec for metadata items
+	 *   * iovec for possible padding after the metadata items
+	 *
+	 * Make sure to update @reserved_parts if you add more parts here.
+	 */
+
+	for (i = 0; i < reserved_parts; ++i)
+		kdbus_staging_reserve(staging);
+
+	return staging;
+
+error:
+	kdbus_staging_free(staging);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Create a staging object for a kernel-generated message that carries a
+ * single item of type @it_type with @it_size bytes of payload. The message
+ * is allocated inline with the staging object; the item payload itself is
+ * left zeroed and reachable via staging->notify.
+ *
+ * Returns the new staging object, or ERR_PTR on failure.
+ */
+struct kdbus_staging *kdbus_staging_new_kernel(struct kdbus_bus *bus,
+					       u64 dst, u64 cookie_timeout,
+					       size_t it_size, size_t it_type)
+{
+	struct kdbus_staging *staging;
+	struct kdbus_item *notify;
+	struct kdbus_msg *msg;
+	size_t size;
+
+	size = offsetof(struct kdbus_msg, items) +
+	       KDBUS_ITEM_HEADER_SIZE + it_size;
+
+	staging = kdbus_staging_new(bus, 0, KDBUS_ALIGN8(size));
+	if (IS_ERR(staging))
+		return ERR_CAST(staging);
+
+	/* fill in the message header */
+	msg = staging->msg;
+	msg->size = size;
+	if (dst == KDBUS_DST_ID_BROADCAST)
+		msg->flags = KDBUS_MSG_SIGNAL;
+	else
+		msg->flags = 0;
+	msg->dst_id = dst;
+	msg->src_id = KDBUS_SRC_ID_KERNEL;
+	msg->payload_type = KDBUS_PAYLOAD_KERNEL;
+	msg->cookie_reply = cookie_timeout;
+
+	/* the one and only item is the notification */
+	notify = msg->items;
+	notify->size = KDBUS_ITEM_HEADER_SIZE + it_size;
+	notify->type = it_type;
+	staging->notify = notify;
+
+	return staging;
+}
+
+/*
+ * Create a staging object for the user-supplied message @msg sent via @cmd.
+ * The message is validated, the required iovecs and (if fds or memfds are
+ * attached) a gaps object are allocated, and all items are imported.
+ *
+ * Returns the new staging object, or ERR_PTR on failure.
+ */
+struct kdbus_staging *kdbus_staging_new_user(struct kdbus_bus *bus,
+					     struct kdbus_cmd_send *cmd,
+					     struct kdbus_msg *msg)
+{
+	const size_t reserved_parts = 1; /* see below for explanation */
+	size_t n_memfds, n_fds, n_parts;
+	struct kdbus_staging *staging;
+	struct kdbus_gaps *gaps;
+	int ret;
+
+	/*
+	 * Look at the message first to learn how many resources the staging
+	 * area needs. This means the message is walked twice, but avoids
+	 * growing the allocations while importing.
+	 */
+
+	ret = kdbus_msg_examine(msg, bus, cmd, &n_memfds, &n_fds, &n_parts);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	/*
+	 * Allocate the staging area with iovecs for all payload parts plus
+	 * the ones reserved below, so that this hopefully remains the only
+	 * memory allocation of the transaction.
+	 */
+
+	staging = kdbus_staging_new(bus, n_parts + reserved_parts, 0);
+	if (IS_ERR(staging))
+		return ERR_CAST(staging);
+
+	staging->msg = msg;
+
+	/*
+	 * File-descriptors cannot be passed at SEND time, so memfd and fd
+	 * items need state that is filled in at RECV time. That state lives
+	 * in a gaps object which is linked from the staging area and later
+	 * from the message queue of each peer; every receiver owns a reference
+	 * to it.
+	 * Note that the gaps object is read-only once the staging-allocator
+	 * returns. There might be connections receiving a queued message while
+	 * the sender still broadcasts the message to other receivers.
+	 */
+
+	if (n_memfds > 0 || n_fds > 0) {
+		gaps = kdbus_gaps_new(n_memfds, n_fds);
+		if (IS_ERR(gaps)) {
+			ret = PTR_ERR(gaps);
+			goto error;
+		}
+		staging->gaps = gaps;
+	}
+
+	/*
+	 * kdbus_staging_new() already reserves parts for message setup. For
+	 * user-supplied messages, we add the following iovecs:
+	 *   ... variable number of iovecs for payload ...
+	 *   * final iovec for possible padding of payload
+	 *
+	 * Make sure to update @reserved_parts if you add more parts here.
+	 */
+
+	ret = kdbus_staging_import(staging); /* payload */
+	kdbus_staging_reserve(staging); /* payload padding */
+
+	if (ret >= 0)
+		return staging;
+
+error:
+	kdbus_staging_free(staging);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Release @staging together with its references to the metadata and gaps
+ * objects. Passing NULL is allowed. Always returns NULL.
+ */
+struct kdbus_staging *kdbus_staging_free(struct kdbus_staging *staging)
+{
+	if (staging) {
+		kdbus_meta_conn_unref(staging->meta_conn);
+		kdbus_meta_proc_unref(staging->meta_proc);
+		kdbus_gaps_unref(staging->gaps);
+		kfree(staging);
+	}
+
+	return NULL;
+}
+
+/*
+ * Collect the metadata to attach to a message going from @src to @dst.
+ *
+ * With a sender, the attach mask comes from kdbus_meta_msg_mask() and
+ * process metadata is collected unless the sender uses faked metadata.
+ * Without a sender (kernel message) only the timestamp is requested.
+ * Connection metadata is collected in both cases. The mask used is
+ * returned in @out_attach on success.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int kdbus_staging_collect_metadata(struct kdbus_staging *staging,
+					  struct kdbus_conn *src,
+					  struct kdbus_conn *dst,
+					  u64 *out_attach)
+{
+	/* metadata for kernel msgs */
+	u64 mask = KDBUS_ATTACH_TIMESTAMP;
+	int ret;
+
+	if (src) {
+		mask = kdbus_meta_msg_mask(src, dst);
+
+		if (!src->meta_fake) {
+			ret = kdbus_meta_proc_collect(staging->meta_proc,
+						      mask);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	ret = kdbus_meta_conn_collect(staging->meta_conn, src,
+				      staging->msg_seqnum, mask);
+	if (ret < 0)
+		return ret;
+
+	*out_attach = mask;
+	return 0;
+}
+
+/**
+ * kdbus_staging_emit() - emit linearized message in target pool
+ * @staging:		staging object to create message from
+ * @src:		sender of the message (or NULL)
+ * @dst:		target connection to allocate message for
+ *
+ * This allocates a pool-slice for @dst and copies the message provided by
+ * @staging into it. The new slice is then returned to the caller for further
+ * processing. It's not linked into any queue, yet.
+ *
+ * The slice is accounted on the quota of @dst. If anything fails after the
+ * quota was charged, the charge is reverted before returning.
+ *
+ * Return: Newly allocated slice or ERR_PTR on failure.
+ */
+struct kdbus_pool_slice *kdbus_staging_emit(struct kdbus_staging *staging,
+					    struct kdbus_conn *src,
+					    struct kdbus_conn *dst)
+{
+	struct kdbus_item *item, *meta_items = NULL;
+	struct kdbus_pool_slice *slice = NULL;
+	size_t off, size, meta_size;
+	bool accounted = false;
+	struct iovec *v;
+	u64 attach, msg_size;
+	int ret;
+
+	/*
+	 * Step 1:
+	 * Collect metadata from @src depending on the attach-flags allowed for
+	 * @dst. Translate it into the namespaces pinned by @dst.
+	 */
+
+	ret = kdbus_staging_collect_metadata(staging, src, dst, &attach);
+	if (ret < 0)
+		goto error;
+
+	ret = kdbus_meta_emit(staging->meta_proc, NULL, staging->meta_conn,
+			      dst, attach, &meta_items, &meta_size);
+	if (ret < 0)
+		goto error;
+
+	/*
+	 * Step 2:
+	 * Setup iovecs for the message. See kdbus_staging_new() for allocation
+	 * of those iovecs. All reserved iovecs have been initialized with
+	 * iov_len=0 + iov_base=zeros. Furthermore, the iovecs to copy the
+	 * actual message payload have already been initialized and need not be
+	 * touched.
+	 */
+
+	v = staging->parts;
+	msg_size = staging->msg->size;
+
+	/* msg.size */
+	v->iov_len = sizeof(msg_size);
+	v->iov_base = (void __user *)&msg_size;
+	++v;
+
+	/* msg (after msg.size) plus items */
+	v->iov_len = staging->msg->size - sizeof(staging->msg->size);
+	v->iov_base = (void __user *)((u8 *)staging->msg +
+				      sizeof(staging->msg->size));
+	++v;
+
+	/* padding after msg */
+	v->iov_len = KDBUS_ALIGN8(staging->msg->size) - staging->msg->size;
+	v->iov_base = (void __user *)zeros;
+	++v;
+
+	if (meta_size > 0) {
+		/* metadata items */
+		v->iov_len = meta_size;
+		v->iov_base = (void __user *)meta_items;
+		++v;
+
+		/* padding after metadata */
+		v->iov_len = KDBUS_ALIGN8(meta_size) - meta_size;
+		v->iov_base = (void __user *)zeros;
+		++v;
+
+		/* the size copied into the slice covers the metadata, too */
+		msg_size = KDBUS_ALIGN8(msg_size) + meta_size;
+	} else {
+		/* metadata items */
+		v->iov_len = 0;
+		v->iov_base = (void __user *)zeros;
+		++v;
+
+		/* padding after metadata */
+		v->iov_len = 0;
+		v->iov_base = (void __user *)zeros;
+		++v;
+	}
+
+	/* ... payload iovecs are already filled in ... */
+
+	/* compute overall size and fill in padding after payload */
+	size = KDBUS_ALIGN8(msg_size);
+
+	if (staging->n_payload > 0) {
+		size += staging->n_payload;
+
+		/* the last part is the reserved payload-padding iovec */
+		v = &staging->parts[staging->n_parts - 1];
+		v->iov_len = KDBUS_ALIGN8(size) - size;
+		v->iov_base = (void __user *)zeros;
+
+		size = KDBUS_ALIGN8(size);
+	}
+
+	/*
+	 * Step 3:
+	 * The PAYLOAD_OFF items in the message contain a relative 'offset'
+	 * field that tells the receiver where to find the actual payload. This
+	 * offset is relative to the start of the message, and as such depends
+	 * on the size of the metadata items we inserted. This size is variable
+	 * and changes for each peer we send the message to. Hence, we remember
+	 * the last relative offset that was used to calculate the 'offset'
+	 * fields. For each message, we re-calculate it and patch all items, in
+	 * case it changed.
+	 */
+
+	off = KDBUS_ALIGN8(msg_size);
+
+	if (off != staging->i_payload) {
+		KDBUS_ITEMS_FOREACH(item, staging->msg->items,
+				    KDBUS_ITEMS_SIZE(staging->msg, items)) {
+			if (item->type != KDBUS_ITEM_PAYLOAD_OFF)
+				continue;
+
+			item->vec.offset -= staging->i_payload;
+			item->vec.offset += off;
+		}
+
+		staging->i_payload = off;
+	}
+
+	/*
+	 * Step 4:
+	 * Allocate pool slice and copy over all data. Make sure to properly
+	 * account on user quota.
+	 */
+
+	ret = kdbus_conn_quota_inc(dst, src ? src->user : NULL, size,
+				   staging->gaps ? staging->gaps->n_fds : 0);
+	if (ret < 0)
+		goto error;
+	accounted = true;
+
+	slice = kdbus_pool_slice_alloc(dst->pool, size, true);
+	if (IS_ERR(slice)) {
+		ret = PTR_ERR(slice);
+		slice = NULL;
+		goto error;
+	}
+
+	WARN_ON(kdbus_pool_slice_size(slice) != size);
+
+	ret = kdbus_pool_slice_copy_iovec(slice, 0, staging->parts,
+					  staging->n_parts, size);
+	if (ret < 0)
+		goto error;
+
+	/* all done, return slice to caller */
+	goto exit;
+
+error:
+	/*
+	 * Revert the quota charge whenever it was taken, including the case
+	 * where the slice allocation itself failed and @slice is NULL.
+	 */
+	if (accounted)
+		kdbus_conn_quota_dec(dst, src ? src->user : NULL, size,
+				     staging->gaps ? staging->gaps->n_fds : 0);
+	kdbus_pool_slice_release(slice);
+	slice = ERR_PTR(ret);
+exit:
+	kfree(meta_items);
+	return slice;
+}
diff --git a/ipc/kdbus/message.h b/ipc/kdbus/message.h
new file mode 100644
index 000000000..298f9c99d
--- /dev/null
+++ b/ipc/kdbus/message.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_MESSAGE_H
+#define __KDBUS_MESSAGE_H
+
+#include <linux/fs.h>
+#include <linux/kref.h>
+#include <uapi/linux/kdbus.h>
+
+struct kdbus_bus;
+struct kdbus_conn;
+struct kdbus_meta_conn;
+struct kdbus_meta_proc;
+struct kdbus_pool_slice;
+
+/**
+ * struct kdbus_gaps - gaps in message to be filled later
+ * @kref:		Reference counter
+ * @n_memfds:		Number of memfds
+ * @memfd_offsets:	Offsets of the fd field of each kdbus_memfd item,
+ *			relative to the start of the message
+ * @memfd_files:	Array of sent memfd files
+ * @n_fds:		Number of fds
+ * @fd_files:		Array of sent fd files
+ * @fd_offset:		Offset of fd-array, relative to the start of the
+ *			message
+ *
+ * The 'gaps' object is used to track data that is needed to fill gaps in a
+ * message at RECV time. Usually, we try to compile the whole message at SEND
+ * time. This has the advantage, that we don't have to cache any information and
+ * can keep the memory consumption small. Furthermore, all copy operations can
+ * be combined into a single function call, which speeds up transactions
+ * considerably.
+ * However, things like file-descriptors can only be fully installed at RECV
+ * time. The gaps object tracks this data and pins it until a message is
+ * received. The gaps object is shared between all receivers of the same
+ * message.
+ */
+struct kdbus_gaps {
+	struct kref kref;
+
+	/* state tracking for KDBUS_ITEM_PAYLOAD_MEMFD entries */
+	size_t n_memfds;
+	u64 *memfd_offsets;
+	struct file **memfd_files;
+
+	/* state tracking for KDBUS_ITEM_FDS */
+	size_t n_fds;
+	struct file **fd_files;
+	u64 fd_offset;
+};
+
+struct kdbus_gaps *kdbus_gaps_ref(struct kdbus_gaps *gaps);
+struct kdbus_gaps *kdbus_gaps_unref(struct kdbus_gaps *gaps);
+int kdbus_gaps_install(struct kdbus_gaps *gaps, struct kdbus_pool_slice *slice,
+		       bool *out_incomplete);
+
+/**
+ * struct kdbus_staging - staging area to import messages
+ * @msg:		Message to send; user-supplied, or allocated together
+ *			with this object for kernel-generated messages
+ * @gaps:		Gaps-object created during import (or NULL if empty)
+ * @msg_seqnum:		Message sequence number
+ * @notify_entry:	Entry into list of kernel-generated notifications
+ * @i_payload:		Current relative index of start of payload
+ * @n_payload:		Total number of bytes needed for payload
+ * @n_parts:		Number of parts currently in use
+ * @parts:		Array of iovecs that make up the whole message; the
+ *			array is allocated together with this object
+ * @meta_proc:		Process metadata of the sender (or NULL if empty)
+ * @meta_conn:		Connection metadata of the sender (or NULL if empty)
+ * @bloom_filter:	Pointer to the bloom-item in @msg, or NULL
+ * @dst_name:		Pointer to the dst-name-item in @msg, or NULL
+ * @notify:		Pointer to the notification item in @msg, or NULL
+ *
+ * The kdbus_staging object is a temporary staging area to import user-supplied
+ * messages into the kernel. It is only used during SEND and dropped once the
+ * message is queued. Any data that cannot be collected during SEND, is
+ * collected in a kdbus_gaps object and attached to the message queue.
+ */
+struct kdbus_staging {
+	struct kdbus_msg *msg;
+	struct kdbus_gaps *gaps;
+	u64 msg_seqnum;
+	struct list_head notify_entry;
+
+	/* crafted iovecs to copy the message */
+	size_t i_payload;
+	size_t n_payload;
+	size_t n_parts;
+	struct iovec *parts;
+
+	/* metadata state */
+	struct kdbus_meta_proc *meta_proc;
+	struct kdbus_meta_conn *meta_conn;
+
+	/* cached pointers into @msg */
+	const struct kdbus_bloom_filter *bloom_filter;
+	const char *dst_name;
+	struct kdbus_item *notify;
+};
+
+struct kdbus_staging *kdbus_staging_new_kernel(struct kdbus_bus *bus,
+					       u64 dst, u64 cookie_timeout,
+					       size_t it_size, size_t it_type);
+struct kdbus_staging *kdbus_staging_new_user(struct kdbus_bus *bus,
+					     struct kdbus_cmd_send *cmd,
+					     struct kdbus_msg *msg);
+struct kdbus_staging *kdbus_staging_free(struct kdbus_staging *staging);
+struct kdbus_pool_slice *kdbus_staging_emit(struct kdbus_staging *staging,
+					    struct kdbus_conn *src,
+					    struct kdbus_conn *dst);
+
+#endif
diff --git a/ipc/kdbus/metadata.c b/ipc/kdbus/metadata.c
new file mode 100644
index 000000000..d4973a90a
--- /dev/null
+++ b/ipc/kdbus/metadata.c
@@ -0,0 +1,1342 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/cgroup.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/fs_struct.h>
+#include <linux/init.h>
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/security.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uidgid.h>
+#include <linux/uio.h>
+#include <linux/user_namespace.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "item.h"
+#include "message.h"
+#include "metadata.h"
+#include "names.h"
+
+/**
+ * struct kdbus_meta_proc - Process metadata
+ * @kref:		Reference counting
+ * @lock:		Object lock
+ * @collected:		Bitmask of collected items
+ * @valid:		Bitmask of collected and valid items
+ * @cred:		Credentials (reference dropped on free, may be NULL)
+ * @pid:		PID of process (pinned reference)
+ * @tgid:		TGID of process (pinned reference)
+ * @ppid:		PPID of process (pinned reference)
+ * @tid_comm:		TID comm line
+ * @pid_comm:		PID comm line
+ * @exe_path:		Executable path (pinned reference)
+ * @root_path:		Root-FS path (pinned reference)
+ * @cmdline:		Command-line (allocated string, freed with the object)
+ * @cgroup:		Full cgroup path (allocated string, freed with the
+ *			object)
+ * @seclabel:		Seclabel (allocated string, freed with the object)
+ * @audit_loginuid:	Audit login-UID
+ * @audit_sessionid:	Audit session-ID
+ */
+struct kdbus_meta_proc {
+	struct kref kref;
+	struct mutex lock;
+	u64 collected;
+	u64 valid;
+
+	/* KDBUS_ITEM_CREDS */
+	/* KDBUS_ITEM_AUXGROUPS */
+	/* KDBUS_ITEM_CAPS */
+	const struct cred *cred;
+
+	/* KDBUS_ITEM_PIDS */
+	struct pid *pid;
+	struct pid *tgid;
+	struct pid *ppid;
+
+	/* KDBUS_ITEM_TID_COMM */
+	char tid_comm[TASK_COMM_LEN];
+	/* KDBUS_ITEM_PID_COMM */
+	char pid_comm[TASK_COMM_LEN];
+
+	/* KDBUS_ITEM_EXE */
+	struct path exe_path;
+	struct path root_path;
+
+	/* KDBUS_ITEM_CMDLINE */
+	char *cmdline;
+
+	/* KDBUS_ITEM_CGROUP */
+	char *cgroup;
+
+	/* KDBUS_ITEM_SECLABEL */
+	char *seclabel;
+
+	/* KDBUS_ITEM_AUDIT */
+	kuid_t audit_loginuid;
+	unsigned int audit_sessionid;
+};
+
+/**
+ * struct kdbus_meta_conn - Connection metadata
+ * @kref:		Reference counting
+ * @lock:		Object lock
+ * @collected:		Bitmask of collected items
+ * @valid:		Bitmask of collected and valid items
+ * @ts:			Timestamp values
+ * @owned_names_items:	Serialized items for owned names
+ * @owned_names_size:	Size of @owned_names_items
+ * @conn_description:	Connection description
+ */
+struct kdbus_meta_conn {
+	struct kref kref;
+	struct mutex lock;
+	u64 collected;
+	u64 valid;
+
+	/* KDBUS_ITEM_TIMESTAMP */
+	struct kdbus_timestamp ts;
+
+	/* KDBUS_ITEM_OWNED_NAME */
+	struct kdbus_item *owned_names_items;
+	size_t owned_names_size;
+
+	/* KDBUS_ITEM_CONN_DESCRIPTION */
+	char *conn_description;
+};
+
+/*
+ * fixed size equivalent of "kdbus_caps": the last capability number,
+ * followed by four capability sets of _KERNEL_CAPABILITY_U32S words each
+ * (NOTE(review): which set is which is not visible here -- confirm against
+ * the code filling this in)
+ */
+struct kdbus_meta_caps {
+	u32 last_cap;
+	struct {
+		u32 caps[_KERNEL_CAPABILITY_U32S];
+	} set[4];
+};
+
+/**
+ * kdbus_meta_proc_new() - Create process metadata object
+ *
+ * The new object is zero-initialized and holds a single reference.
+ *
+ * Return: Pointer to new object on success, ERR_PTR on failure.
+ */
+struct kdbus_meta_proc *kdbus_meta_proc_new(void)
+{
+	struct kdbus_meta_proc *proc;
+
+	proc = kzalloc(sizeof(*proc), GFP_KERNEL);
+	if (proc == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&proc->lock);
+	kref_init(&proc->kref);
+
+	return proc;
+}
+
+/*
+ * kref release callback: drop every reference pinned by the process
+ * metadata object, free its strings and finally the object itself.
+ */
+static void kdbus_meta_proc_free(struct kref *kref)
+{
+	struct kdbus_meta_proc *proc;
+
+	proc = container_of(kref, struct kdbus_meta_proc, kref);
+
+	/* pinned kernel objects */
+	path_put(&proc->exe_path);
+	path_put(&proc->root_path);
+	if (proc->cred)
+		put_cred(proc->cred);
+	put_pid(proc->ppid);
+	put_pid(proc->tgid);
+	put_pid(proc->pid);
+
+	/* allocated strings */
+	kfree(proc->seclabel);
+	kfree(proc->cmdline);
+	kfree(proc->cgroup);
+
+	kfree(proc);
+}
+
+/**
+ * kdbus_meta_proc_ref() - Gain reference
+ * @mp:		Process metadata object (may be NULL)
+ *
+ * Return: @mp is returned
+ */
+struct kdbus_meta_proc *kdbus_meta_proc_ref(struct kdbus_meta_proc *mp)
+{
+	if (!mp)
+		return NULL;
+
+	kref_get(&mp->kref);
+	return mp;
+}
+
+/**
+ * kdbus_meta_proc_unref() - Drop reference
+ * @mp:		Process metadata object (may be NULL)
+ *
+ * The object is freed once the last reference is dropped.
+ *
+ * Return: NULL
+ */
+struct kdbus_meta_proc *kdbus_meta_proc_unref(struct kdbus_meta_proc *mp)
+{
+	if (!mp)
+		return NULL;
+
+	kref_put(&mp->kref, kdbus_meta_proc_free);
+	return NULL;
+}
+
+/*
+ * Pin the pid and tgid of the calling task and the tgid of its real parent,
+ * then mark the PIDS item as valid.
+ */
+static void kdbus_meta_proc_collect_pids(struct kdbus_meta_proc *mp)
+{
+	struct task_struct *real_parent;
+
+	/* the parent pointer must be read under RCU */
+	rcu_read_lock();
+	real_parent = rcu_dereference(current->real_parent);
+	mp->ppid = get_pid(task_tgid(real_parent));
+	rcu_read_unlock();
+
+	mp->tgid = get_pid(task_tgid(current));
+	mp->pid = get_pid(task_pid(current));
+
+	mp->valid |= KDBUS_ATTACH_PIDS;
+}
+
+/* Store the comm name of the calling task and mark TID_COMM as valid. */
+static void kdbus_meta_proc_collect_tid_comm(struct kdbus_meta_proc *mp)
+{
+	struct task_struct *task = current;
+
+	get_task_comm(mp->tid_comm, task);
+	mp->valid |= KDBUS_ATTACH_TID_COMM;
+}
+
+/*
+ * Store the comm name of the calling task's group leader and mark PID_COMM
+ * as valid.
+ */
+static void kdbus_meta_proc_collect_pid_comm(struct kdbus_meta_proc *mp)
+{
+	struct task_struct *leader = current->group_leader;
+
+	get_task_comm(mp->pid_comm, leader);
+	mp->valid |= KDBUS_ATTACH_PID_COMM;
+}
+
+/*
+ * Record the path of the calling task's executable together with the task's
+ * filesystem root. If the mm has no exe_file, nothing is stored and the EXE
+ * item is left invalid.
+ *
+ * NOTE(review): dereferences current->mm without a NULL check; presumably
+ * callers always run in a user task context -- confirm.
+ */
+static void kdbus_meta_proc_collect_exe(struct kdbus_meta_proc *mp)
+{
+	struct file *exe_file;
+
+	/* exe_file is read under RCU; a path reference is taken below */
+	rcu_read_lock();
+	exe_file = rcu_dereference(current->mm->exe_file);
+	if (exe_file) {
+		mp->exe_path = exe_file->f_path;
+		path_get(&mp->exe_path);
+		get_fs_root(current->fs, &mp->root_path);
+		mp->valid |= KDBUS_ATTACH_EXE;
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Copy the argument area of the calling task into @mp and mark the CMDLINE
+ * item as valid. If the mm has no argument area end set, nothing is
+ * collected and 0 is returned.
+ *
+ * Returns 0 on success, or the error reported by strndup_user().
+ */
+static int kdbus_meta_proc_collect_cmdline(struct kdbus_meta_proc *mp)
+{
+	struct mm_struct *mm = current->mm;
+	char *copy;
+
+	if (mm->arg_end == 0)
+		return 0;
+
+	copy = strndup_user((const char __user *)mm->arg_start,
+			    mm->arg_end - mm->arg_start);
+	if (!IS_ERR(copy)) {
+		mp->cmdline = copy;
+		mp->valid |= KDBUS_ATTACH_CMDLINE;
+		return 0;
+	}
+
+	return PTR_ERR(copy);
+}
+
+/* Store current's cgroup path in @mp; flagged valid only if one was stored. */
+static int kdbus_meta_proc_collect_cgroup(struct kdbus_meta_proc *mp)
+{
+	int ret = 0;
+#ifdef CONFIG_CGROUPS
+	void *page;
+	char *s;
+
+	page = (void *)__get_free_page(GFP_TEMPORARY);
+	if (!page)
+		return -ENOMEM;
+
+	s = task_cgroup_path(current, page, PAGE_SIZE);
+	if (s) { /* NULL on lookup failure: item is skipped, not flagged */
+		mp->cgroup = kstrdup(s, GFP_KERNEL);
+		if (mp->cgroup)
+			mp->valid |= KDBUS_ATTACH_CGROUP;
+		else
+			ret = -ENOMEM;
+	}
+	free_page((unsigned long)page);
+#endif
+
+	return ret;
+}
+
+static int kdbus_meta_proc_collect_seclabel(struct kdbus_meta_proc *mp)
+{
+#ifdef CONFIG_SECURITY
+	char *secctx = NULL;
+	u32 secid, secctx_len;
+	int r;
+
+	security_task_getsecid(current, &secid);
+	r = security_secid_to_secctx(secid, &secctx, &secctx_len);
+	/*
+	 * -EOPNOTSUPP: no security module is active. Report success
+	 * without storing a label, which drops the SECLABEL item.
+	 */
+	if (r == -EOPNOTSUPP)
+		return 0;
+	if (r < 0)
+		return r;
+
+	mp->seclabel = kstrdup(secctx, GFP_KERNEL);
+	security_release_secctx(secctx, secctx_len);
+	if (!mp->seclabel)
+		return -ENOMEM;
+
+	mp->valid |= KDBUS_ATTACH_SECLABEL;
+#endif
+
+	return 0;
+}
+
+static void kdbus_meta_proc_collect_audit(struct kdbus_meta_proc *mp)
+{
+#ifdef CONFIG_AUDITSYSCALL
+	mp->audit_loginuid = audit_get_loginuid(current);
+	mp->audit_sessionid = audit_get_sessionid(current);
+	mp->valid |= KDBUS_ATTACH_AUDIT; /* never set without AUDITSYSCALL */
+#endif
+}
+
+/**
+ * kdbus_meta_proc_collect() - Collect process metadata
+ * @mp:		Process metadata object, may be %NULL
+ * @what:	Attach flags to collect
+ *
+ * Collect the items in @what from current into @mp, each one at most once.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_meta_proc_collect(struct kdbus_meta_proc *mp, u64 what)
+{
+	int ret;
+
+	if (!mp || !(what & (KDBUS_ATTACH_CREDS |
+			     KDBUS_ATTACH_PIDS |
+			     KDBUS_ATTACH_AUXGROUPS |
+			     KDBUS_ATTACH_TID_COMM |
+			     KDBUS_ATTACH_PID_COMM |
+			     KDBUS_ATTACH_EXE |
+			     KDBUS_ATTACH_CMDLINE |
+			     KDBUS_ATTACH_CGROUP |
+			     KDBUS_ATTACH_CAPS |
+			     KDBUS_ATTACH_SECLABEL |
+			     KDBUS_ATTACH_AUDIT)))
+		return 0;
+
+	mutex_lock(&mp->lock); /* held while @collected/@valid change */
+
+	/* creds, auxgrps and caps share "struct cred" as context */
+	{
+		const u64 m_cred = KDBUS_ATTACH_CREDS |
+				   KDBUS_ATTACH_AUXGROUPS |
+				   KDBUS_ATTACH_CAPS;
+
+		if ((what & m_cred) && !(mp->collected & m_cred)) {
+			mp->cred = get_current_cred();
+			mp->valid |= m_cred;
+			mp->collected |= m_cred;
+		}
+	}
+
+	if ((what & KDBUS_ATTACH_PIDS) &&
+	    !(mp->collected & KDBUS_ATTACH_PIDS)) {
+		kdbus_meta_proc_collect_pids(mp);
+		mp->collected |= KDBUS_ATTACH_PIDS;
+	}
+
+	if ((what & KDBUS_ATTACH_TID_COMM) &&
+	    !(mp->collected & KDBUS_ATTACH_TID_COMM)) {
+		kdbus_meta_proc_collect_tid_comm(mp);
+		mp->collected |= KDBUS_ATTACH_TID_COMM;
+	}
+
+	if ((what & KDBUS_ATTACH_PID_COMM) &&
+	    !(mp->collected & KDBUS_ATTACH_PID_COMM)) {
+		kdbus_meta_proc_collect_pid_comm(mp);
+		mp->collected |= KDBUS_ATTACH_PID_COMM;
+	}
+
+	if ((what & KDBUS_ATTACH_EXE) &&
+	    !(mp->collected & KDBUS_ATTACH_EXE)) {
+		kdbus_meta_proc_collect_exe(mp);
+		mp->collected |= KDBUS_ATTACH_EXE;
+	}
+
+	if ((what & KDBUS_ATTACH_CMDLINE) &&
+	    !(mp->collected & KDBUS_ATTACH_CMDLINE)) {
+		ret = kdbus_meta_proc_collect_cmdline(mp);
+		if (ret < 0) /* item stays uncollected, a later call retries */
+			goto exit_unlock;
+		mp->collected |= KDBUS_ATTACH_CMDLINE;
+	}
+
+	if ((what & KDBUS_ATTACH_CGROUP) &&
+	    !(mp->collected & KDBUS_ATTACH_CGROUP)) {
+		ret = kdbus_meta_proc_collect_cgroup(mp);
+		if (ret < 0)
+			goto exit_unlock;
+		mp->collected |= KDBUS_ATTACH_CGROUP;
+	}
+
+	if ((what & KDBUS_ATTACH_SECLABEL) &&
+	    !(mp->collected & KDBUS_ATTACH_SECLABEL)) {
+		ret = kdbus_meta_proc_collect_seclabel(mp);
+		if (ret < 0)
+			goto exit_unlock;
+		mp->collected |= KDBUS_ATTACH_SECLABEL;
+	}
+
+	if ((what & KDBUS_ATTACH_AUDIT) &&
+	    !(mp->collected & KDBUS_ATTACH_AUDIT)) {
+		kdbus_meta_proc_collect_audit(mp);
+		mp->collected |= KDBUS_ATTACH_AUDIT;
+	}
+
+	ret = 0;
+
+exit_unlock:
+	mutex_unlock(&mp->lock);
+	return ret;
+}
+
+/**
+ * kdbus_meta_fake_new() - Allocate an empty fake metadata object
+ *
+ * Return: Pointer to new object on success, ERR_PTR on failure.
+ */
+struct kdbus_meta_fake *kdbus_meta_fake_new(void)
+{
+	struct kdbus_meta_fake *fake = kzalloc(sizeof(*fake), GFP_KERNEL);
+
+	/* zeroed: no item is flagged valid and all pointers are NULL */
+	if (!fake)
+		return ERR_PTR(-ENOMEM);
+
+	return fake;
+}
+
+/**
+ * kdbus_meta_fake_free() - Release a fake metadata object
+ * @mf:		Fake metadata object, may be %NULL
+ *
+ * Return: NULL
+ */
+struct kdbus_meta_fake *kdbus_meta_fake_free(struct kdbus_meta_fake *mf)
+{
+	if (!mf)
+		return NULL;
+
+	kfree(mf->seclabel);
+	put_pid(mf->pid);
+	put_pid(mf->tgid);
+	put_pid(mf->ppid);
+	kfree(mf);
+	return NULL;
+}
+
+/**
+ * kdbus_meta_fake_collect() - Fill fake metadata from faked credentials
+ * @mf:		Fake metadata object
+ * @creds:	Creds to set, may be %NULL
+ * @pids:	PIDs to set, may be %NULL
+ * @seclabel:	Seclabel to set, may be %NULL
+ *
+ * This function takes information stored in @creds, @pids and @seclabel and
+ * resolves them to kernel-representations, if possible. This call uses the
+ * current task's namespaces to resolve the given information.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_meta_fake_collect(struct kdbus_meta_fake *mf,
+			    const struct kdbus_creds *creds,
+			    const struct kdbus_pids *pids,
+			    const char *seclabel)
+{
+	if (mf->valid) /* an object can be filled only once */
+		return -EALREADY;
+
+	if (creds) {
+		struct user_namespace *ns = current_user_ns();
+
+		mf->uid		= make_kuid(ns, creds->uid);
+		mf->euid	= make_kuid(ns, creds->euid);
+		mf->suid	= make_kuid(ns, creds->suid);
+		mf->fsuid	= make_kuid(ns, creds->fsuid);
+
+		mf->gid		= make_kgid(ns, creds->gid);
+		mf->egid	= make_kgid(ns, creds->egid);
+		mf->sgid	= make_kgid(ns, creds->sgid);
+		mf->fsgid	= make_kgid(ns, creds->fsgid);
+
+		/* -1 means "not set"; any other ID must map into @ns */
+		if ((creds->uid   != (uid_t)-1 && !uid_valid(mf->uid))   ||
+		    (creds->euid  != (uid_t)-1 && !uid_valid(mf->euid))  ||
+		    (creds->suid  != (uid_t)-1 && !uid_valid(mf->suid))  ||
+		    (creds->fsuid != (uid_t)-1 && !uid_valid(mf->fsuid)) ||
+		    (creds->gid   != (gid_t)-1 && !gid_valid(mf->gid))   ||
+		    (creds->egid  != (gid_t)-1 && !gid_valid(mf->egid))  ||
+		    (creds->sgid  != (gid_t)-1 && !gid_valid(mf->sgid))  ||
+		    (creds->fsgid != (gid_t)-1 && !gid_valid(mf->fsgid)))
+			return -EINVAL;
+		mf->valid |= KDBUS_ATTACH_CREDS;
+	}
+
+	if (pids) {
+		mf->pid = find_get_pid(pids->tid); /* lookup + ref under RCU */
+		mf->tgid = find_get_pid(pids->pid);
+		mf->ppid = find_get_pid(pids->ppid);
+
+		if ((pids->tid != 0 && !mf->pid) ||
+		    (pids->pid != 0 && !mf->tgid) ||
+		    (pids->ppid != 0 && !mf->ppid)) {
+			put_pid(mf->pid);
+			put_pid(mf->tgid);
+			put_pid(mf->ppid);
+			mf->pid = NULL;
+			mf->tgid = NULL;
+			mf->ppid = NULL;
+			return -EINVAL;
+		}
+
+		mf->valid |= KDBUS_ATTACH_PIDS;
+	}
+
+	if (seclabel) {
+		mf->seclabel = kstrdup(seclabel, GFP_KERNEL);
+		if (!mf->seclabel)
+			return -ENOMEM;
+
+		mf->valid |= KDBUS_ATTACH_SECLABEL;
+	}
+
+	return 0;
+}
+
+/**
+ * kdbus_meta_conn_new() - Create connection metadata object
+ *
+ * Return: New object holding one reference, ERR_PTR(-ENOMEM) on failure.
+ */
+struct kdbus_meta_conn *kdbus_meta_conn_new(void)
+{
+	struct kdbus_meta_conn *mc;
+
+	mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+	if (!mc)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&mc->kref);
+	mutex_init(&mc->lock);
+
+	return mc;
+}
+
+static void kdbus_meta_conn_free(struct kref *kref)
+{
+	struct kdbus_meta_conn *mc =
+		container_of(kref, struct kdbus_meta_conn, kref);
+
+	kfree(mc->conn_description); /* NULL unless collected */
+	kfree(mc->owned_names_items); /* NULL unless collected */
+	kfree(mc);
+}
+
+/**
+ * kdbus_meta_conn_ref() - Gain reference; returns @mc
+ * @mc:		Connection metadata object, may be %NULL
+ */
+struct kdbus_meta_conn *kdbus_meta_conn_ref(struct kdbus_meta_conn *mc)
+{
+	if (mc)
+		kref_get(&mc->kref);
+	return mc;
+}
+
+/**
+ * kdbus_meta_conn_unref() - Drop reference; always returns NULL
+ * @mc:		Connection metadata object, may be %NULL
+ */
+struct kdbus_meta_conn *kdbus_meta_conn_unref(struct kdbus_meta_conn *mc)
+{
+	if (mc)
+		kref_put(&mc->kref, kdbus_meta_conn_free);
+	return NULL;
+}
+
+static void kdbus_meta_conn_collect_timestamp(struct kdbus_meta_conn *mc,
+					      u64 msg_seqnum)
+{
+	mc->ts.monotonic_ns = ktime_get_ns();
+	mc->ts.realtime_ns = ktime_get_real_ns();
+
+	if (msg_seqnum) /* 0: keep whatever seqnum is already stored */
+		mc->ts.seqnum = msg_seqnum;
+
+	mc->valid |= KDBUS_ATTACH_TIMESTAMP;
+}
+
+static int kdbus_meta_conn_collect_names(struct kdbus_meta_conn *mc,
+					 struct kdbus_conn *conn)
+{
+	const struct kdbus_name_entry *e;
+	struct kdbus_item *item;
+	size_t slen, size;
+
+	lockdep_assert_held(&conn->ep->bus->name_registry->rwlock);
+
+	size = 0;
+	/* open-code length calculation to avoid final padding */
+	list_for_each_entry(e, &conn->names_list, conn_entry)
+		size = KDBUS_ALIGN8(size) + KDBUS_ITEM_HEADER_SIZE +
+			sizeof(struct kdbus_name) + strlen(e->name) + 1;
+
+	if (!size) /* no names owned: NAMES is not flagged valid */
+		return 0;
+
+	/* make sure we include zeroed padding for convenience helpers */
+	item = kmalloc(KDBUS_ALIGN8(size), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	mc->owned_names_items = item;
+	mc->owned_names_size = size; /* length without final padding */
+
+	list_for_each_entry(e, &conn->names_list, conn_entry) {
+		slen = strlen(e->name) + 1;
+		kdbus_item_set(item, KDBUS_ITEM_OWNED_NAME, NULL,
+			       sizeof(struct kdbus_name) + slen);
+		item->name.flags = e->flags;
+		memcpy(item->name.name, e->name, slen);
+		item = KDBUS_ITEM_NEXT(item);
+	}
+
+	/* sanity check: the buffer should be completely written now */
+	WARN_ON((u8 *)item !=
+			(u8 *)mc->owned_names_items + KDBUS_ALIGN8(size));
+
+	mc->valid |= KDBUS_ATTACH_NAMES;
+	return 0;
+}
+
+static int kdbus_meta_conn_collect_description(struct kdbus_meta_conn *mc,
+					       struct kdbus_conn *conn)
+{
+	if (!conn->description) /* nothing to attach, item stays invalid */
+		return 0;
+
+	mc->conn_description = kstrdup(conn->description, GFP_KERNEL);
+	if (!mc->conn_description)
+		return -ENOMEM;
+
+	mc->valid |= KDBUS_ATTACH_CONN_DESCRIPTION;
+	return 0;
+}
+
+/**
+ * kdbus_meta_conn_collect() - Collect connection metadata
+ * @mc:		Connection metadata object, may be %NULL
+ * @conn:	Connection to collect data from
+ * @msg_seqnum:	Sequence number of the message to send
+ * @what:	Attach flags to collect
+ *
+ * This collects connection metadata from @msg_seqnum and @conn and saves it
+ * in @mc. A zero @msg_seqnum or %NULL @conn skips the items derived from it.
+ *
+ * If KDBUS_ATTACH_NAMES is set in @what and @conn is non-NULL, the caller must
+ * hold the name-registry lock of conn->ep->bus->name_registry.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_meta_conn_collect(struct kdbus_meta_conn *mc,
+			    struct kdbus_conn *conn,
+			    u64 msg_seqnum, u64 what)
+{
+	int ret;
+
+	if (!mc || !(what & (KDBUS_ATTACH_TIMESTAMP |
+			     KDBUS_ATTACH_NAMES |
+			     KDBUS_ATTACH_CONN_DESCRIPTION)))
+		return 0;
+
+	mutex_lock(&mc->lock);
+
+	if (msg_seqnum && (what & KDBUS_ATTACH_TIMESTAMP) &&
+	    !(mc->collected & KDBUS_ATTACH_TIMESTAMP)) {
+		kdbus_meta_conn_collect_timestamp(mc, msg_seqnum);
+		mc->collected |= KDBUS_ATTACH_TIMESTAMP;
+	}
+
+	if (conn && (what & KDBUS_ATTACH_NAMES) &&
+	    !(mc->collected & KDBUS_ATTACH_NAMES)) {
+		ret = kdbus_meta_conn_collect_names(mc, conn);
+		if (ret < 0)
+			goto exit_unlock;
+		mc->collected |= KDBUS_ATTACH_NAMES;
+	}
+
+	if (conn && (what & KDBUS_ATTACH_CONN_DESCRIPTION) &&
+	    !(mc->collected & KDBUS_ATTACH_CONN_DESCRIPTION)) {
+		ret = kdbus_meta_conn_collect_description(mc, conn);
+		if (ret < 0)
+			goto exit_unlock;
+		mc->collected |= KDBUS_ATTACH_CONN_DESCRIPTION;
+	}
+
+	ret = 0;
+
+exit_unlock:
+	mutex_unlock(&mc->lock);
+	return ret;
+}
+
+static void kdbus_meta_export_caps(struct kdbus_meta_caps *out,
+				   const struct kdbus_meta_proc *mp,
+				   struct user_namespace *user_ns)
+{
+	struct user_namespace *iter;
+	const struct cred *cred = mp->cred;
+	bool parent = false, owner = false;
+	int i;
+
+	/*
+	 * This translates the effective capabilities of 'cred' into the given
+	 * user-namespace. If the given user-namespace is a child-namespace of
+	 * the user-namespace of 'cred', the mask can be copied verbatim. If
+	 * not, the mask is cleared.
+	 * There's one exception: If 'cred' is the owner of any user-namespace
+	 * in the path between the given user-namespace and the user-namespace
+	 * of 'cred', then it has all effective capabilities set. This means,
+	 * the user who created a user-namespace always has all effective
+	 * capabilities in any child namespaces. Note that this is based on the
+	 * uid of the namespace creator, not the task hierarchy.
+	 */
+	for (iter = user_ns; iter; iter = iter->parent) {
+		if (iter == cred->user_ns) {
+			parent = true;
+			break;
+		}
+
+		if (iter == &init_user_ns) /* stop at the initial namespace */
+			break;
+
+		if ((iter->parent == cred->user_ns) &&
+		    uid_eq(iter->owner, cred->euid)) {
+			owner = true;
+			break;
+		}
+	}
+
+	out->last_cap = CAP_LAST_CAP;
+
+	CAP_FOR_EACH_U32(i) {
+		if (parent) {
+			out->set[0].caps[i] = cred->cap_inheritable.cap[i];
+			out->set[1].caps[i] = cred->cap_permitted.cap[i];
+			out->set[2].caps[i] = cred->cap_effective.cap[i];
+			out->set[3].caps[i] = cred->cap_bset.cap[i];
+		} else if (owner) {
+			out->set[0].caps[i] = 0U;
+			out->set[1].caps[i] = ~0U;
+			out->set[2].caps[i] = ~0U;
+			out->set[3].caps[i] = ~0U;
+		} else { /* neither parent nor owner: export no caps */
+			out->set[0].caps[i] = 0U;
+			out->set[1].caps[i] = 0U;
+			out->set[2].caps[i] = 0U;
+			out->set[3].caps[i] = 0U;
+		}
+	}
+
+	/* clear unused bits */
+	for (i = 0; i < 4; i++) /* inheritable, permitted, effective, bset */
+		out->set[i].caps[CAP_TO_INDEX(CAP_LAST_CAP)] &=
+					CAP_LAST_U32_VALID_MASK;
+}
+
+/* Like from_kuid_munged(), but an invalid @uid is exported as (uid_t)-1 */
+static uid_t kdbus_from_kuid_keep(struct user_namespace *ns, kuid_t uid)
+{
+	return uid_valid(uid) ? from_kuid_munged(ns, uid) : ((uid_t)-1);
+}
+
+/* Like from_kgid_munged(), but an invalid @gid is exported as (gid_t)-1 */
+static gid_t kdbus_from_kgid_keep(struct user_namespace *ns, kgid_t gid)
+{
+	return gid_valid(gid) ? from_kgid_munged(ns, gid) : ((gid_t)-1);
+}
+
+struct kdbus_meta_staging {
+	const struct kdbus_meta_proc *mp;
+	const struct kdbus_meta_fake *mf;
+	const struct kdbus_meta_conn *mc;
+	const struct kdbus_conn *conn;
+	u64 mask;		/* valid items that were also requested */
+
+	void *exe;		/* temporary page backing @exe_path, or NULL */
+	const char *exe_path;	/* d_path() result, NULL if EXE is dropped */
+};
+
+static size_t kdbus_meta_measure(struct kdbus_meta_staging *staging)
+{
+	const struct kdbus_meta_proc *mp = staging->mp;
+	const struct kdbus_meta_fake *mf = staging->mf;
+	const struct kdbus_meta_conn *mc = staging->mc;
+	const u64 mask = staging->mask;
+	size_t size = 0; /* sum of the padded sizes of all selected items */
+
+	/* process metadata */
+
+	if (mf && (mask & KDBUS_ATTACH_CREDS))
+		size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_creds));
+	else if (mp && (mask & KDBUS_ATTACH_CREDS))
+		size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_creds));
+
+	if (mf && (mask & KDBUS_ATTACH_PIDS))
+		size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_pids));
+	else if (mp && (mask & KDBUS_ATTACH_PIDS))
+		size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_pids));
+
+	if (mp && (mask & KDBUS_ATTACH_AUXGROUPS))
+		size += KDBUS_ITEM_SIZE(mp->cred->group_info->ngroups *
+					sizeof(u64));
+
+	if (mp && (mask & KDBUS_ATTACH_TID_COMM))
+		size += KDBUS_ITEM_SIZE(strlen(mp->tid_comm) + 1);
+
+	if (mp && (mask & KDBUS_ATTACH_PID_COMM))
+		size += KDBUS_ITEM_SIZE(strlen(mp->pid_comm) + 1);
+
+	if (staging->exe_path && (mask & KDBUS_ATTACH_EXE))
+		size += KDBUS_ITEM_SIZE(strlen(staging->exe_path) + 1);
+
+	if (mp && (mask & KDBUS_ATTACH_CMDLINE))
+		size += KDBUS_ITEM_SIZE(strlen(mp->cmdline) + 1);
+
+	if (mp && (mask & KDBUS_ATTACH_CGROUP))
+		size += KDBUS_ITEM_SIZE(strlen(mp->cgroup) + 1);
+
+	if (mp && (mask & KDBUS_ATTACH_CAPS))
+		size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_meta_caps));
+
+	if (mf && (mask & KDBUS_ATTACH_SECLABEL))
+		size += KDBUS_ITEM_SIZE(strlen(mf->seclabel) + 1);
+	else if (mp && (mask & KDBUS_ATTACH_SECLABEL))
+		size += KDBUS_ITEM_SIZE(strlen(mp->seclabel) + 1);
+
+	if (mp && (mask & KDBUS_ATTACH_AUDIT))
+		size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_audit));
+
+	/* connection metadata */
+
+	if (mc && (mask & KDBUS_ATTACH_NAMES))
+		size += KDBUS_ALIGN8(mc->owned_names_size);
+
+	if (mc && (mask & KDBUS_ATTACH_CONN_DESCRIPTION))
+		size += KDBUS_ITEM_SIZE(strlen(mc->conn_description) + 1);
+
+	if (mc && (mask & KDBUS_ATTACH_TIMESTAMP))
+		size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_timestamp));
+
+	return size;
+}
+
+static struct kdbus_item *kdbus_write_head(struct kdbus_item **iter,
+					   u64 type, u64 size)
+{
+	struct kdbus_item *item = *iter;
+	size_t padding;
+
+	item->type = type;
+	item->size = KDBUS_ITEM_HEADER_SIZE + size; /* @size: payload only */
+
+	/* clear padding */
+	padding = KDBUS_ALIGN8(item->size) - item->size;
+	if (padding)
+		memset(item->data + size, 0, padding);
+
+	*iter = KDBUS_ITEM_NEXT(item); /* advance the caller's cursor */
+	return item;
+}
+
+static struct kdbus_item *kdbus_write_full(struct kdbus_item **iter,
+					   u64 type, u64 size, const void *data)
+{
+	struct kdbus_item *item;
+
+	item = kdbus_write_head(iter, type, size);
+	memcpy(item->data, data, size); /* padding was zeroed by the head */
+	return item;
+}
+
+static size_t kdbus_meta_write(struct kdbus_meta_staging *staging, void *mem,
+			       size_t size)
+{
+	struct user_namespace *user_ns = staging->conn->cred->user_ns;
+	struct pid_namespace *pid_ns = ns_of_pid(staging->conn->pid);
+	struct kdbus_item *item = NULL, *items = mem;
+	u8 *end, *owned_names_end = NULL; /* set only if NAMES is written */
+
+	/* process metadata */
+
+	if (staging->mf && (staging->mask & KDBUS_ATTACH_CREDS)) {
+		const struct kdbus_meta_fake *mf = staging->mf;
+
+		item = kdbus_write_head(&items, KDBUS_ITEM_CREDS,
+					sizeof(struct kdbus_creds));
+		item->creds = (struct kdbus_creds){
+			.uid	= kdbus_from_kuid_keep(user_ns, mf->uid),
+			.euid	= kdbus_from_kuid_keep(user_ns, mf->euid),
+			.suid	= kdbus_from_kuid_keep(user_ns, mf->suid),
+			.fsuid	= kdbus_from_kuid_keep(user_ns, mf->fsuid),
+			.gid	= kdbus_from_kgid_keep(user_ns, mf->gid),
+			.egid	= kdbus_from_kgid_keep(user_ns, mf->egid),
+			.sgid	= kdbus_from_kgid_keep(user_ns, mf->sgid),
+			.fsgid	= kdbus_from_kgid_keep(user_ns, mf->fsgid),
+		};
+	} else if (staging->mp && (staging->mask & KDBUS_ATTACH_CREDS)) {
+		const struct cred *c = staging->mp->cred;
+
+		item = kdbus_write_head(&items, KDBUS_ITEM_CREDS,
+					sizeof(struct kdbus_creds));
+		item->creds = (struct kdbus_creds){
+			.uid	= kdbus_from_kuid_keep(user_ns, c->uid),
+			.euid	= kdbus_from_kuid_keep(user_ns, c->euid),
+			.suid	= kdbus_from_kuid_keep(user_ns, c->suid),
+			.fsuid	= kdbus_from_kuid_keep(user_ns, c->fsuid),
+			.gid	= kdbus_from_kgid_keep(user_ns, c->gid),
+			.egid	= kdbus_from_kgid_keep(user_ns, c->egid),
+			.sgid	= kdbus_from_kgid_keep(user_ns, c->sgid),
+			.fsgid	= kdbus_from_kgid_keep(user_ns, c->fsgid),
+		};
+	}
+
+	if (staging->mf && (staging->mask & KDBUS_ATTACH_PIDS)) {
+		item = kdbus_write_head(&items, KDBUS_ITEM_PIDS,
+					sizeof(struct kdbus_pids));
+		item->pids = (struct kdbus_pids){
+			.pid = pid_nr_ns(staging->mf->tgid, pid_ns),
+			.tid = pid_nr_ns(staging->mf->pid, pid_ns),
+			.ppid = pid_nr_ns(staging->mf->ppid, pid_ns),
+		};
+	} else if (staging->mp && (staging->mask & KDBUS_ATTACH_PIDS)) {
+		item = kdbus_write_head(&items, KDBUS_ITEM_PIDS,
+					sizeof(struct kdbus_pids));
+		item->pids = (struct kdbus_pids){
+			.pid = pid_nr_ns(staging->mp->tgid, pid_ns),
+			.tid = pid_nr_ns(staging->mp->pid, pid_ns),
+			.ppid = pid_nr_ns(staging->mp->ppid, pid_ns),
+		};
+	}
+
+	if (staging->mp && (staging->mask & KDBUS_ATTACH_AUXGROUPS)) {
+		const struct group_info *info = staging->mp->cred->group_info;
+		size_t i;
+
+		item = kdbus_write_head(&items, KDBUS_ITEM_AUXGROUPS,
+					info->ngroups * sizeof(u64));
+		for (i = 0; i < info->ngroups; ++i) /* one u64 per group */
+			item->data64[i] = from_kgid_munged(user_ns,
+							   GROUP_AT(info, i));
+	}
+
+	if (staging->mp && (staging->mask & KDBUS_ATTACH_TID_COMM))
+		item = kdbus_write_full(&items, KDBUS_ITEM_TID_COMM,
+					strlen(staging->mp->tid_comm) + 1,
+					staging->mp->tid_comm);
+
+	if (staging->mp && (staging->mask & KDBUS_ATTACH_PID_COMM))
+		item = kdbus_write_full(&items, KDBUS_ITEM_PID_COMM,
+					strlen(staging->mp->pid_comm) + 1,
+					staging->mp->pid_comm);
+
+	if (staging->exe_path && (staging->mask & KDBUS_ATTACH_EXE))
+		item = kdbus_write_full(&items, KDBUS_ITEM_EXE,
+					strlen(staging->exe_path) + 1,
+					staging->exe_path);
+
+	if (staging->mp && (staging->mask & KDBUS_ATTACH_CMDLINE))
+		item = kdbus_write_full(&items, KDBUS_ITEM_CMDLINE,
+					strlen(staging->mp->cmdline) + 1,
+					staging->mp->cmdline);
+
+	if (staging->mp && (staging->mask & KDBUS_ATTACH_CGROUP))
+		item = kdbus_write_full(&items, KDBUS_ITEM_CGROUP,
+					strlen(staging->mp->cgroup) + 1,
+					staging->mp->cgroup);
+
+	if (staging->mp && (staging->mask & KDBUS_ATTACH_CAPS)) {
+		item = kdbus_write_head(&items, KDBUS_ITEM_CAPS,
+					sizeof(struct kdbus_meta_caps));
+		kdbus_meta_export_caps((void*)&item->caps, staging->mp,
+				       user_ns);
+	}
+
+	if (staging->mf && (staging->mask & KDBUS_ATTACH_SECLABEL))
+		item = kdbus_write_full(&items, KDBUS_ITEM_SECLABEL,
+					strlen(staging->mf->seclabel) + 1,
+					staging->mf->seclabel);
+	else if (staging->mp && (staging->mask & KDBUS_ATTACH_SECLABEL))
+		item = kdbus_write_full(&items, KDBUS_ITEM_SECLABEL,
+					strlen(staging->mp->seclabel) + 1,
+					staging->mp->seclabel);
+
+	if (staging->mp && (staging->mask & KDBUS_ATTACH_AUDIT)) {
+		item = kdbus_write_head(&items, KDBUS_ITEM_AUDIT,
+					sizeof(struct kdbus_audit));
+		item->audit = (struct kdbus_audit){
+			.loginuid = from_kuid(user_ns,
+					      staging->mp->audit_loginuid),
+			.sessionid = staging->mp->audit_sessionid,
+		};
+	}
+
+	/* connection metadata */
+
+	if (staging->mc && (staging->mask & KDBUS_ATTACH_NAMES)) {
+		memcpy(items, staging->mc->owned_names_items,
+		       KDBUS_ALIGN8(staging->mc->owned_names_size));
+		owned_names_end = (u8 *)items + staging->mc->owned_names_size;
+		items = (void *)KDBUS_ALIGN8((unsigned long)owned_names_end);
+	}
+
+	if (staging->mc && (staging->mask & KDBUS_ATTACH_CONN_DESCRIPTION))
+		item = kdbus_write_full(&items, KDBUS_ITEM_CONN_DESCRIPTION,
+				strlen(staging->mc->conn_description) + 1,
+				staging->mc->conn_description);
+
+	if (staging->mc && (staging->mask & KDBUS_ATTACH_TIMESTAMP))
+		item = kdbus_write_full(&items, KDBUS_ITEM_TIMESTAMP,
+					sizeof(staging->mc->ts),
+					&staging->mc->ts);
+
+	/*
+	 * Return real size (minus trailing padding). In case of 'owned_names'
+	 * we cannot deduce it from item->size, so treat it special.
+	 */
+
+	if (items == (void *)KDBUS_ALIGN8((unsigned long)owned_names_end))
+		end = owned_names_end;
+	else if (item)
+		end = (u8 *)item + item->size;
+	else
+		end = mem; /* nothing was written */
+
+	WARN_ON((u8 *)items - (u8 *)mem != size); /* @size: padded length */
+	WARN_ON((void *)KDBUS_ALIGN8((unsigned long)end) != (void *)items);
+
+	return end - (u8 *)mem;
+}
+
+int kdbus_meta_emit(struct kdbus_meta_proc *mp,
+		    struct kdbus_meta_fake *mf,
+		    struct kdbus_meta_conn *mc,
+		    struct kdbus_conn *conn,
+		    u64 mask,
+		    struct kdbus_item **out_items,
+		    size_t *out_size)
+{
+	struct kdbus_meta_staging staging = {};
+	struct kdbus_item *items = NULL;
+	size_t size = 0;
+	int ret;
+
+	if (WARN_ON(mf && mp)) /* fake and real metadata are exclusive */
+		mp = NULL;
+
+	staging.mp = mp;
+	staging.mf = mf;
+	staging.mc = mc;
+	staging.conn = conn;
+
+	/* get mask of valid items */
+	if (mf)
+		staging.mask |= mf->valid;
+	if (mp) {
+		mutex_lock(&mp->lock);
+		staging.mask |= mp->valid;
+		mutex_unlock(&mp->lock);
+	}
+	if (mc) {
+		mutex_lock(&mc->lock);
+		staging.mask |= mc->valid;
+		mutex_unlock(&mc->lock);
+	}
+
+	staging.mask &= mask; /* valid items the caller asked for */
+
+	if (!staging.mask) { /* bail out if nothing to do */
+		ret = 0;
+		goto exit;
+	}
+
+	/* EXE is special as it needs a temporary page to assemble */
+	if (mp && (staging.mask & KDBUS_ATTACH_EXE)) {
+		struct path p;
+
+		/*
+		 * XXX: We need access to __d_path() so we can write the path
+		 * relative to conn->root_path. Once upstream, we need
+		 * EXPORT_SYMBOL(__d_path) or an equivalent of d_path() that
+		 * takes the root path directly. Until then, we drop this item
+		 * if the root-paths differ.
+		 */
+
+		get_fs_root(current->fs, &p);
+		if (path_equal(&p, &conn->root_path)) {
+			staging.exe = (void *)__get_free_page(GFP_TEMPORARY);
+			if (!staging.exe) {
+				path_put(&p);
+				ret = -ENOMEM;
+				goto exit;
+			}
+
+			staging.exe_path = d_path(&mp->exe_path, staging.exe,
+						  PAGE_SIZE);
+			if (IS_ERR(staging.exe_path)) {
+				path_put(&p);
+				ret = PTR_ERR(staging.exe_path);
+				goto exit;
+			}
+		}
+		path_put(&p); /* drop the get_fs_root() reference */
+	}
+
+	size = kdbus_meta_measure(&staging);
+	if (!size) { /* bail out if nothing to do */
+		ret = 0;
+		goto exit;
+	}
+
+	items = kmalloc(size, GFP_KERNEL);
+	if (!items) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	size = kdbus_meta_write(&staging, items, size);
+	if (!size) { /* nothing written: hand back NULL instead */
+		kfree(items);
+		items = NULL;
+	}
+
+	ret = 0;
+
+exit:
+	if (staging.exe)
+		free_page((unsigned long)staging.exe);
+	if (ret >= 0) { /* outputs are left untouched on error */
+		*out_items = items;
+		*out_size = size;
+	}
+	return ret;
+}
+
+enum {
+	KDBUS_META_PROC_NONE,	/* no /proc access to the target */
+	KDBUS_META_PROC_NORMAL,	/* regular /proc access */
+};
+
+/**
+ * kdbus_proc_permission() - check /proc permissions on target pid
+ * @pid_ns:		namespace we operate in
+ * @cred:		credentials of requestor
+ * @target:		target process (not inspected by the current checks)
+ *
+ * This checks whether a process with credentials @cred can access information
+ * of @target in the namespace @pid_ns. This tries to follow /proc permissions,
+ * but is slightly more restrictive.
+ *
+ * Return: The /proc access level (KDBUS_META_PROC_*) is returned.
+ */
+static unsigned int kdbus_proc_permission(const struct pid_namespace *pid_ns,
+					  const struct cred *cred,
+					  struct pid *target)
+{
+	if (pid_ns->hide_pid < 1)
+		return KDBUS_META_PROC_NORMAL;
+
+	/* XXX: we need groups_search() exported for aux-groups */
+	if (gid_eq(cred->egid, pid_ns->pid_gid))
+		return KDBUS_META_PROC_NORMAL;
+
+	/*
+	 * XXX: If ptrace_may_access(PTRACE_MODE_READ) is granted, you can
+	 * overwrite hide_pid. However, ptrace_may_access() only supports
+	 * checking 'current', hence, we cannot use this here. But we
+	 * simply decide to not support this override, so no need to worry.
+	 */
+
+	return KDBUS_META_PROC_NONE;
+}
+
+/**
+ * kdbus_meta_proc_mask() - calculate which metadata would be visible to
+ *			    a connection via /proc
+ * @prv_pid:		pid of metadata provider
+ * @req_pid:		pid of metadata requestor
+ * @req_cred:		credentials of metadata requestor
+ * @wanted:		metadata that is requested
+ *
+ * This checks which metadata items of @prv_pid can be read via /proc by the
+ * requestor @req_pid.
+ *
+ * Return: Set of metadata flags the requestor can see (limited by @wanted).
+ */
+static u64 kdbus_meta_proc_mask(struct pid *prv_pid,
+				struct pid *req_pid,
+				const struct cred *req_cred,
+				u64 wanted)
+{
+	struct pid_namespace *req_ns;
+	unsigned int proc;
+
+	/* visibility is judged from the requestor's pid namespace only */
+	req_ns = ns_of_pid(req_pid);
+
+	/*
+	 * If the sender is not visible in the receiver namespace, then the
+	 * receiver cannot access the sender via its own procfs. Hence, we do
+	 * not attach any additional metadata.
+	 */
+	if (!pid_nr_ns(prv_pid, req_ns))
+		return 0;
+
+	/*
+	 * If the pid-namespace of the receiver has hide_pid set, it cannot see
+	 * any process but its own. We shortcut this /proc permission check if
+	 * provider and requestor are the same. If not, we perform rather
+	 * expensive /proc permission checks.
+	 */
+	if (prv_pid == req_pid)
+		proc = KDBUS_META_PROC_NORMAL;
+	else
+		proc = kdbus_proc_permission(req_ns, req_cred, prv_pid);
+
+	/* you need /proc access to read standard process attributes */
+	if (proc < KDBUS_META_PROC_NORMAL)
+		wanted &= ~(KDBUS_ATTACH_TID_COMM |
+			    KDBUS_ATTACH_PID_COMM |
+			    KDBUS_ATTACH_SECLABEL |
+			    KDBUS_ATTACH_CMDLINE |
+			    KDBUS_ATTACH_CGROUP |
+			    KDBUS_ATTACH_AUDIT |
+			    KDBUS_ATTACH_CAPS |
+			    KDBUS_ATTACH_EXE);
+
+	/* clear all non-/proc flags */
+	return wanted & (KDBUS_ATTACH_TID_COMM |
+			 KDBUS_ATTACH_PID_COMM |
+			 KDBUS_ATTACH_SECLABEL |
+			 KDBUS_ATTACH_CMDLINE |
+			 KDBUS_ATTACH_CGROUP |
+			 KDBUS_ATTACH_AUDIT |
+			 KDBUS_ATTACH_CAPS |
+			 KDBUS_ATTACH_EXE);
+}
+
+/**
+ * kdbus_meta_get_mask() - calculate attach flags mask for metadata request
+ * @prv_pid:		pid of metadata provider
+ * @prv_mask:		mask of metadata the provider grants unchecked
+ * @req_pid:		pid of metadata requestor
+ * @req_cred:		credentials of metadata requestor
+ * @req_mask:		mask of metadata that is requested
+ *
+ * This calculates the metadata items that the requestor @req_pid can access
+ * from the metadata provider @prv_pid. The permission check is made up of
+ * several different parts:
+ *  - Providers can grant metadata items unchecked. Whatever their type, such
+ *    items always go to the requestor. This mask is passed as @prv_mask.
+ *  - Basic items (credentials and connection metadata) are granted implicitly
+ *    to everyone. They're publicly available to any bus-user that can see the
+ *    provider.
+ *  - Process credentials that are not granted implicitly follow the same
+ *    permission checks as /proc. This means, we always assume a requestor
+ *    process has access to their *own* /proc mount, if they have access to
+ *    kdbusfs.
+ *
+ * Return: Mask of metadata that is granted.
+ */
+static u64 kdbus_meta_get_mask(struct pid *prv_pid, u64 prv_mask,
+			       struct pid *req_pid,
+			       const struct cred *req_cred, u64 req_mask)
+{
+	u64 granted, missing;
+
+	/*
+	 * Start from what the provider grants, plus the implicit items:
+	 * connection metadata and basic unix process credentials cannot be
+	 * suppressed. Both are required to perform user-space policies on
+	 * the receiver-side. Furthermore, connection metadata is public
+	 * state, anyway, and unix credentials are needed for
+	 * UDS-compatibility. We extend them slightly by auxiliary groups
+	 * and additional uids/gids/pids.
+	 */
+	granted = prv_mask |
+		  KDBUS_ATTACH_CONN_DESCRIPTION |
+		  KDBUS_ATTACH_TIMESTAMP |
+		  KDBUS_ATTACH_NAMES |
+		  KDBUS_ATTACH_AUXGROUPS |
+		  KDBUS_ATTACH_CREDS |
+		  KDBUS_ATTACH_PIDS;
+
+	/*
+	 * Whatever is requested but neither implicit nor granted by the
+	 * sender is still missing. If any such items are left, perform
+	 * rather expensive /proc access checks for them.
+	 */
+	missing = req_mask & ~granted;
+	if (missing)
+		granted |= kdbus_meta_proc_mask(prv_pid, req_pid, req_cred,
+						missing);
+
+	/* never hand out more than was asked for */
+	return granted & req_mask;
+}
+
+/**
+ * kdbus_meta_info_mask() - subset of @mask of @conn's metadata current may see
+ */
+u64 kdbus_meta_info_mask(const struct kdbus_conn *conn, u64 mask)
+{
+	u64 granted = atomic64_read(&conn->attach_flags_send);
+
+	return kdbus_meta_get_mask(conn->pid, granted, task_pid(current),
+				   current_cred(), mask);
+}
+
+/**
+ * kdbus_meta_msg_mask() - metadata of sender @snd that receiver @rcv may see
+ */
+u64 kdbus_meta_msg_mask(const struct kdbus_conn *snd,
+			const struct kdbus_conn *rcv)
+{
+	return kdbus_meta_get_mask(task_pid(current),
+				   atomic64_read(&snd->attach_flags_send),
+				   rcv->pid, rcv->cred,
+				   atomic64_read(&rcv->attach_flags_recv));
+}
diff --git a/ipc/kdbus/metadata.h b/ipc/kdbus/metadata.h
new file mode 100644
index 000000000..dba7cc7fd
--- /dev/null
+++ b/ipc/kdbus/metadata.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_METADATA_H
+#define __KDBUS_METADATA_H
+
+#include <linux/kernel.h>
+
+struct kdbus_conn;
+struct kdbus_pool_slice;
+
+struct kdbus_meta_proc;
+struct kdbus_meta_conn;
+
+/**
+ * struct kdbus_meta_fake - Fake metadata
+ * @valid:		Bitmask of collected and valid items
+ * @uid:		UID of process
+ * @euid:		EUID of process
+ * @suid:		SUID of process
+ * @fsuid:		FSUID of process
+ * @gid:		GID of process
+ * @egid:		EGID of process
+ * @sgid:		SGID of process
+ * @fsgid:		FSGID of process
+ * @pid:		PID of process
+ * @tgid:		TGID of process
+ * @ppid:		PPID of process
+ * @seclabel:		Seclabel
+ */
+struct kdbus_meta_fake {
+	u64 valid;
+
+	/* KDBUS_ITEM_CREDS: user ids */
+	kuid_t uid;
+	kuid_t euid;
+	kuid_t suid;
+	kuid_t fsuid;
+
+	/* KDBUS_ITEM_CREDS: group ids */
+	kgid_t gid;
+	kgid_t egid;
+	kgid_t sgid;
+	kgid_t fsgid;
+
+	/* KDBUS_ITEM_PIDS */
+	struct pid *pid;
+	struct pid *tgid;
+	struct pid *ppid;
+
+	/* KDBUS_ITEM_SECLABEL */
+	char *seclabel;
+};
+
+/* Interface operating on struct kdbus_meta_proc objects */
+struct kdbus_meta_proc *kdbus_meta_proc_new(void);
+struct kdbus_meta_proc *kdbus_meta_proc_ref(struct kdbus_meta_proc *mp);
+struct kdbus_meta_proc *kdbus_meta_proc_unref(struct kdbus_meta_proc *mp);
+int kdbus_meta_proc_collect(struct kdbus_meta_proc *mp, u64 what);
+
+/*
+ * Interface operating on struct kdbus_meta_fake objects.
+ *
+ * NOTE(review): struct kdbus_creds, struct kdbus_pids and (further below)
+ * struct kdbus_item are not forward-declared in this header; presumably they
+ * come from a header included earlier by every user -- confirm.
+ */
+struct kdbus_meta_fake *kdbus_meta_fake_new(void);
+struct kdbus_meta_fake *kdbus_meta_fake_free(struct kdbus_meta_fake *mf);
+int kdbus_meta_fake_collect(struct kdbus_meta_fake *mf,
+			    const struct kdbus_creds *creds,
+			    const struct kdbus_pids *pids,
+			    const char *seclabel);
+
+/* Interface operating on struct kdbus_meta_conn objects */
+struct kdbus_meta_conn *kdbus_meta_conn_new(void);
+struct kdbus_meta_conn *kdbus_meta_conn_ref(struct kdbus_meta_conn *mc);
+struct kdbus_meta_conn *kdbus_meta_conn_unref(struct kdbus_meta_conn *mc);
+int kdbus_meta_conn_collect(struct kdbus_meta_conn *mc,
+			    struct kdbus_conn *conn,
+			    u64 msg_seqnum, u64 what);
+
+/* Item emission and calculation of the granted metadata masks */
+int kdbus_meta_emit(struct kdbus_meta_proc *mp,
+		    struct kdbus_meta_fake *mf,
+		    struct kdbus_meta_conn *mc,
+		    struct kdbus_conn *conn,
+		    u64 mask,
+		    struct kdbus_item **out_items,
+		    size_t *out_size);
+u64 kdbus_meta_info_mask(const struct kdbus_conn *conn, u64 mask);
+u64 kdbus_meta_msg_mask(const struct kdbus_conn *snd,
+			const struct kdbus_conn *rcv);
+
+#endif
diff --git a/ipc/kdbus/names.c b/ipc/kdbus/names.c
new file mode 100644
index 000000000..057f8061c
--- /dev/null
+++ b/ipc/kdbus/names.c
@@ -0,0 +1,770 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/hash.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "names.h"
+#include "notify.h"
+#include "policy.h"
+
+/**
+ * struct kdbus_name_pending - queued request of a connection for a name
+ * @flags:		Flags stored alongside the request
+ * @conn:		Connection the request belongs to
+ * @name:		Name entry the request is queued on
+ * @conn_entry:		Entry in the connection's names_queue_list
+ * @name_entry:		Entry in the queue of the name entry
+ */
+struct kdbus_name_pending {
+	u64 flags;
+	struct kdbus_conn *conn;
+	struct kdbus_name_entry *name;
+	struct list_head conn_entry;
+	struct list_head name_entry;
+};
+
+/*
+ * Allocate a pending request of @conn for the name entry @e, remember @flags
+ * and append it to both the queue of @e and the names_queue_list of @conn.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static int kdbus_name_pending_new(struct kdbus_name_entry *e,
+				  struct kdbus_conn *conn, u64 flags)
+{
+	struct kdbus_name_pending *pending;
+
+	kdbus_conn_assert_active(conn);
+
+	pending = kmalloc(sizeof(*pending), GFP_KERNEL);
+	if (!pending)
+		return -ENOMEM;
+
+	pending->name = e;
+	pending->conn = conn;
+	pending->flags = flags;
+
+	list_add_tail(&pending->name_entry, &e->queue);
+	list_add_tail(&pending->conn_entry, &conn->names_queue_list);
+
+	return 0;
+}
+
+/* Unlink @p from both lists it is linked on and free it; NULL is ignored. */
+static void kdbus_name_pending_free(struct kdbus_name_pending *p)
+{
+	if (p) {
+		list_del(&p->name_entry);
+		list_del(&p->conn_entry);
+		kfree(p);
+	}
+}
+
+/*
+ * Allocate a name entry for @name, assign it the next sequence number of the
+ * registry @r and insert it into the registry's hash table under @hash. The
+ * new entry has no owner, no activator and an empty queue.
+ *
+ * Returns the new entry, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct kdbus_name_entry *
+kdbus_name_entry_new(struct kdbus_name_registry *r, u32 hash, const char *name)
+{
+	/* size of the name including its terminating NUL byte */
+	const size_t size = strlen(name) + 1;
+	struct kdbus_name_entry *entry;
+
+	entry = kmalloc(sizeof(*entry) + size, GFP_KERNEL);
+	if (!entry)
+		return ERR_PTR(-ENOMEM);
+
+	entry->name_id = ++r->name_seq_last;
+	entry->flags = 0;
+	entry->conn = NULL;
+	entry->activator = NULL;
+	INIT_LIST_HEAD(&entry->queue);
+	INIT_LIST_HEAD(&entry->conn_entry);
+	hash_add(r->entries_hash, &entry->hentry, hash);
+	memcpy(entry->name, name, size);
+
+	return entry;
+}
+
+/*
+ * Remove @e from the registry hash table and free it; NULL is ignored.
+ * Warns if the entry is still linked to a connection, still has queued
+ * requests, or still has an activator or owner set.
+ */
+static void kdbus_name_entry_free(struct kdbus_name_entry *e)
+{
+	if (e) {
+		WARN_ON(!list_empty(&e->conn_entry));
+		WARN_ON(!list_empty(&e->queue));
+		WARN_ON(e->activator);
+		WARN_ON(e->conn);
+
+		hash_del(&e->hentry);
+		kfree(e);
+	}
+}
+
+/*
+ * Make @conn the owner of @e: take a reference on @conn, store @flags, link
+ * @e into the owner's names_list and bump the connection's name counter.
+ * Warns if @e already has an owner.
+ */
+static void kdbus_name_entry_set_owner(struct kdbus_name_entry *e,
+				       struct kdbus_conn *conn, u64 flags)
+{
+	WARN_ON(e->conn);
+
+	e->flags = flags;
+	e->conn = kdbus_conn_ref(conn);
+	list_add_tail(&e->conn_entry, &e->conn->names_list);
+	atomic_inc(&conn->name_count);
+}
+
+/*
+ * Detach @e from its current owner: unlink it from the owner's list,
+ * decrement the owner's name counter, clear the flags and drop the reference
+ * on the owner. Warns if @e has no owner.
+ */
+static void kdbus_name_entry_remove_owner(struct kdbus_name_entry *e)
+{
+	struct kdbus_conn *owner = e->conn;
+
+	WARN_ON(!owner);
+
+	atomic_dec(&owner->name_count);
+	list_del_init(&e->conn_entry);
+	e->flags = 0;
+	/* e->conn takes whatever kdbus_conn_unref() hands back */
+	e->conn = kdbus_conn_unref(owner);
+}
+
+/*
+ * Transfer ownership of @e from its current owner to @conn with @flags and
+ * report a KDBUS_ITEM_NAME_CHANGE on the bus of @conn. Warns and does
+ * nothing if @e has no owner or if @conn already is the owner.
+ */
+static void kdbus_name_entry_replace_owner(struct kdbus_name_entry *e,
+					   struct kdbus_conn *conn, u64 flags)
+{
+	if (WARN_ON(!e->conn))
+		return;
+	if (WARN_ON(conn == e->conn))
+		return;
+
+	/* must come first: it reads the old owner's id and flags from @e */
+	kdbus_notify_name_change(conn->ep->bus, KDBUS_ITEM_NAME_CHANGE,
+				 e->conn->id, conn->id,
+				 e->flags, flags, e->name);
+
+	kdbus_name_entry_remove_owner(e);
+	kdbus_name_entry_set_owner(e, conn, flags);
+}
+
+/**
+ * kdbus_name_is_valid() - check if a name is valid
+ * @p:			The name to check
+ * @allow_wildcard:	Whether or not to allow a wildcard name
+ *
+ * A name is valid if all of the following criteria are met:
+ *
+ *  - The name has two or more elements separated by a period ('.') character.
+ *  - All elements must contain at least one character.
+ *  - Each element must only contain the ASCII characters "[A-Z][a-z][0-9]_-"
+ *    and must not begin with a digit.
+ *  - The name must not exceed KDBUS_NAME_MAX_LEN.
+ *  - If @allow_wildcard is true, the name may end on '.*'
+ *
+ * Return: true if @p is a valid name, false otherwise.
+ */
+bool kdbus_name_is_valid(const char *p, bool allow_wildcard)
+{
+	bool at_start = true;	/* next character begins a new element */
+	bool seen_dot = false;
+	const char *q = p;
+
+	while (*q) {
+		const char c = *q;
+
+		if (c == '.') {
+			/* empty element: leading dot or two dots in a row */
+			if (at_start)
+				return false;
+
+			seen_dot = true;
+			at_start = true;
+		} else if (isalpha(c) || c == '_' || c == '-') {
+			at_start = false;
+		} else if (isdigit(c)) {
+			/* digits are fine anywhere but at an element start */
+			if (at_start)
+				return false;
+		} else if (allow_wildcard && at_start &&
+			   c == '*' && q[1] == '\0') {
+			/* a lone '*' is accepted as the very last element */
+			at_start = false;
+		} else {
+			return false;
+		}
+
+		q++;
+	}
+
+	if (q - p > KDBUS_NAME_MAX_LEN)
+		return false;
+
+	/* reject empty names, trailing dots and single-element names */
+	return !at_start && seen_dot;
+}
+
+/**
+ * kdbus_name_registry_new() - create a new name registry
+ *
+ * Allocates a registry with an empty hash table, an initialized rwsem and
+ * the name sequence counter set to 0.
+ *
+ * Return: a new kdbus_name_registry on success, ERR_PTR on failure.
+ */
+struct kdbus_name_registry *kdbus_name_registry_new(void)
+{
+	struct kdbus_name_registry *reg = kmalloc(sizeof(*reg), GFP_KERNEL);
+
+	if (!reg)
+		return ERR_PTR(-ENOMEM);
+
+	reg->name_seq_last = 0;
+	init_rwsem(&reg->rwlock);
+	hash_init(reg->entries_hash);
+
+	return reg;
+}
+
+/**
+ * kdbus_name_registry_free() - free a name registry
+ * @reg:		The name registry, may be %NULL
+ *
+ * Frees the registry object. Warns if the registry still contains name
+ * entries at this point.
+ */
+void kdbus_name_registry_free(struct kdbus_name_registry *reg)
+{
+	if (reg)
+		WARN_ON(!hash_empty(reg->entries_hash));
+
+	/* kfree() accepts NULL, so no separate early return is needed */
+	kfree(reg);
+}
+
+/*
+ * Look up @name in the hash bucket selected by @hash. The registry lock must
+ * be held by the caller. Returns the matching entry, or NULL if none exists.
+ */
+static struct kdbus_name_entry *
+kdbus_name_find(struct kdbus_name_registry *reg, u32 hash, const char *name)
+{
+	struct kdbus_name_entry *entry;
+
+	lockdep_assert_held(&reg->rwlock);
+
+	hash_for_each_possible(reg->entries_hash, entry, hentry, hash) {
+		if (!strcmp(entry->name, name))
+			return entry;
+	}
+
+	return NULL;
+}
+
+/**
+ * kdbus_name_lookup_unlocked() - lookup name in registry
+ * @reg:		name registry
+ * @name:		name to lookup
+ *
+ * Hashes @name and searches the given name-registry for a matching
+ * kdbus_name_entry object. The caller must hold the registry-lock and must not
+ * access the returned object after releasing the lock.
+ *
+ * Return: Pointer to name-entry, or NULL if not found.
+ */
+struct kdbus_name_entry *
+kdbus_name_lookup_unlocked(struct kdbus_name_registry *reg, const char *name)
+{
+	u32 hash = kdbus_strhash(name);
+
+	return kdbus_name_find(reg, hash, name);
+}
+
+/**
+ * kdbus_name_acquire() - acquire a name
+ * @reg:		The name registry
+ * @conn:		The connection to pin this entry to
+ * @name:		The name to acquire
+ * @flags:		Acquisition flags (KDBUS_NAME_*)
+ * @return_flags:	Pointer to return flags for the acquired name
+ *			(KDBUS_NAME_*), may be %NULL
+ *
+ * Callers must ensure that @conn is either a privileged bus user or has
+ * sufficient privileges in the policy-db to own the well-known name @name.
+ *
+ * The whole operation runs with the registry lock held for writing. Depending
+ * on the state of the name, the connection becomes the owner, is registered
+ * as activator, replaces the previous owner, or is put on the waiting-queue
+ * of the name (reported via KDBUS_NAME_IN_QUEUE in @return_flags).
+ *
+ * Return: 0 on success, negative error number on failure: -EPERM if the
+ * policy check fails, -EALREADY if @conn already owns the name, -EEXIST if
+ * the name is busy or already has an activator, -EINVAL if an activator
+ * tries to own more than one name, or the error of a failed allocation.
+ */
+int kdbus_name_acquire(struct kdbus_name_registry *reg,
+		       struct kdbus_conn *conn, const char *name,
+		       u64 flags, u64 *return_flags)
+{
+	struct kdbus_name_entry *e;
+	u64 rflags = 0;
+	int ret = 0;
+	u32 hash;
+
+	kdbus_conn_assert_active(conn);
+
+	/* policy check, lookup and all modifications happen under the lock */
+	down_write(&reg->rwlock);
+
+	if (!kdbus_conn_policy_own_name(conn, current_cred(), name)) {
+		ret = -EPERM;
+		goto exit_unlock;
+	}
+
+	hash = kdbus_strhash(name);
+	e = kdbus_name_find(reg, hash, name);
+	if (!e) {
+		/* claim new name */
+
+		if (conn->activator_of) {
+			ret = -EINVAL;
+			goto exit_unlock;
+		}
+
+		e = kdbus_name_entry_new(reg, hash, name);
+		if (IS_ERR(e)) {
+			ret = PTR_ERR(e);
+			goto exit_unlock;
+		}
+
+		/* an activator is recorded on the entry in addition to owning it */
+		if (kdbus_conn_is_activator(conn)) {
+			e->activator = kdbus_conn_ref(conn);
+			conn->activator_of = e;
+		}
+
+		kdbus_name_entry_set_owner(e, conn, flags);
+		kdbus_notify_name_change(e->conn->ep->bus, KDBUS_ITEM_NAME_ADD,
+					 0, e->conn->id, 0, e->flags, e->name);
+	} else if (e->conn == conn || e == conn->activator_of) {
+		/* connection already owns that name */
+		ret = -EALREADY;
+	} else if (kdbus_conn_is_activator(conn)) {
+		/* activator claims existing name */
+
+		if (conn->activator_of) {
+			ret = -EINVAL; /* multiple names not allowed */
+		} else if (e->activator) {
+			ret = -EEXIST; /* only one activator per name */
+		} else {
+			e->activator = kdbus_conn_ref(conn);
+			conn->activator_of = e;
+		}
+	} else if (e->flags & KDBUS_NAME_ACTIVATOR) {
+		/* claim name of an activator */
+
+		kdbus_conn_move_messages(conn, e->activator, 0);
+		kdbus_name_entry_replace_owner(e, conn, flags);
+	} else if ((flags & KDBUS_NAME_REPLACE_EXISTING) &&
+		   (e->flags & KDBUS_NAME_ALLOW_REPLACEMENT)) {
+		/* claim name of a previous owner */
+
+		if (e->flags & KDBUS_NAME_QUEUE) {
+			/* move owner back to queue if they asked for it */
+			ret = kdbus_name_pending_new(e, e->conn, e->flags);
+			if (ret < 0)
+				goto exit_unlock;
+		}
+
+		kdbus_name_entry_replace_owner(e, conn, flags);
+	} else if (flags & KDBUS_NAME_QUEUE) {
+		/* add to waiting-queue of the name */
+
+		ret = kdbus_name_pending_new(e, conn, flags);
+		if (ret >= 0)
+			/* tell the caller that we queued it */
+			rflags |= KDBUS_NAME_IN_QUEUE;
+	} else {
+		/* the name is busy, return a failure */
+		ret = -EEXIST;
+	}
+
+	/* the return flags are only written back on success */
+	if (ret == 0 && return_flags)
+		*return_flags = rflags;
+
+exit_unlock:
+	/* reached on success and failure; the flush runs after unlocking */
+	up_write(&reg->rwlock);
+	kdbus_notify_flush(conn->ep->bus);
+	return ret;
+}
+
+/*
+ * Take the name entry @e away from its current owner. The caller must hold
+ * reg->rwlock (both callers in this file take it for writing).
+ *
+ * The name is passed on in this order of preference: to the first request on
+ * the queue of @e, else to the activator of @e if that is not the current
+ * owner, else the name is released and the entry is freed. In the last case
+ * @e must not be touched by the caller afterwards.
+ */
+static void kdbus_name_release_unlocked(struct kdbus_name_registry *reg,
+					struct kdbus_name_entry *e)
+{
+	struct kdbus_name_pending *p;
+
+	lockdep_assert_held(&reg->rwlock);
+
+	p = list_first_entry_or_null(&e->queue, struct kdbus_name_pending,
+				     name_entry);
+
+	if (p) {
+		/* give it to first active waiter in the queue */
+		kdbus_name_entry_replace_owner(e, p->conn, p->flags);
+		kdbus_name_pending_free(p);
+	} else if (e->activator && e->activator != e->conn) {
+		/* hand it back to an active activator connection */
+		kdbus_conn_move_messages(e->activator, e->conn, e->name_id);
+		kdbus_name_entry_replace_owner(e, e->activator,
+					       KDBUS_NAME_ACTIVATOR);
+	} else {
+		/* release the name */
+		kdbus_notify_name_change(e->conn->ep->bus,
+					 KDBUS_ITEM_NAME_REMOVE,
+					 e->conn->id, 0, e->flags, 0, e->name);
+		kdbus_name_entry_remove_owner(e);
+		kdbus_name_entry_free(e);
+	}
+}
+
+/*
+ * Release @name on behalf of @conn. If @conn owns the name, ownership is
+ * given up; if @conn is merely queued for the name, its queued request is
+ * dropped instead.
+ *
+ * Returns 0 on success, -ESRCH if the name is not registered, -EADDRINUSE if
+ * the name exists but @conn neither owns it nor is queued for it.
+ */
+static int kdbus_name_release(struct kdbus_name_registry *reg,
+			      struct kdbus_conn *conn,
+			      const char *name)
+{
+	struct kdbus_name_pending *pending;
+	struct kdbus_name_entry *entry;
+	int ret;
+
+	down_write(&reg->rwlock);
+
+	entry = kdbus_name_find(reg, kdbus_strhash(name), name);
+	if (!entry) {
+		ret = -ESRCH;
+		goto unlock;
+	}
+
+	if (entry->conn == conn) {
+		kdbus_name_release_unlocked(reg, entry);
+		ret = 0;
+		goto unlock;
+	}
+
+	/* not the owner: look for a queued request of this connection */
+	ret = -EADDRINUSE;
+	list_for_each_entry(pending, &entry->queue, name_entry) {
+		if (pending->conn != conn)
+			continue;
+
+		/* stop iterating right away, the element is gone */
+		kdbus_name_pending_free(pending);
+		ret = 0;
+		break;
+	}
+
+unlock:
+	up_write(&reg->rwlock);
+
+	kdbus_notify_flush(conn->ep->bus);
+	return ret;
+}
+
+/**
+ * kdbus_name_release_all() - remove all name entries of a given connection
+ * @reg:		name registry
+ * @conn:		connection
+ *
+ * With the registry lock held for writing, this clears the activator link of
+ * the entry @conn is activator of (if any), drops all queued name requests of
+ * @conn and releases every name @conn owns. Pending notifications are flushed
+ * after the lock has been released.
+ */
+void kdbus_name_release_all(struct kdbus_name_registry *reg,
+			    struct kdbus_conn *conn)
+{
+	struct kdbus_name_pending *p;
+	struct kdbus_conn *activator = NULL;
+	struct kdbus_name_entry *e;
+
+	down_write(&reg->rwlock);
+
+	/*
+	 * Clear the activator pointer of the entry first, so the release
+	 * loop below does not hand names back to it. The connection that was
+	 * stored there is unreferenced once the lock is dropped.
+	 */
+	if (conn->activator_of) {
+		activator = conn->activator_of->activator;
+		conn->activator_of->activator = NULL;
+	}
+
+	/* each call unlinks the element, so re-read the list head every time */
+	while ((p = list_first_entry_or_null(&conn->names_queue_list,
+					     struct kdbus_name_pending,
+					     conn_entry)))
+		kdbus_name_pending_free(p);
+	while ((e = list_first_entry_or_null(&conn->names_list,
+					     struct kdbus_name_entry,
+					     conn_entry)))
+		kdbus_name_release_unlocked(reg, e);
+
+	up_write(&reg->rwlock);
+
+	kdbus_conn_unref(activator);
+	kdbus_notify_flush(conn->ep->bus);
+}
+
+/**
+ * kdbus_cmd_name_acquire() - handle KDBUS_CMD_NAME_ACQUIRE
+ * @conn:		connection to operate on
+ * @argp:		command payload
+ *
+ * Only ordinary connections may acquire names (-EOPNOTSUPP otherwise). The
+ * name item of the command must be a valid, non-wildcard name (-EINVAL), and
+ * the connection must stay within KDBUS_CONN_MAX_NAMES (-E2BIG).
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_name_acquire(struct kdbus_conn *conn, void __user *argp)
+{
+	const char *item_name;
+	struct kdbus_cmd *cmd;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_NAME, .mandatory = true },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_NAME_REPLACE_EXISTING |
+				 KDBUS_NAME_ALLOW_REPLACEMENT |
+				 KDBUS_NAME_QUEUE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	if (!kdbus_conn_is_ordinary(conn))
+		return -EOPNOTSUPP;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	/* argv[1] is the mandatory KDBUS_ITEM_NAME item */
+	item_name = argv[1].item->str;
+	if (!kdbus_name_is_valid(item_name, false)) {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	/*
+	 * Do atomic_inc_return here to reserve our slot, then decrement
+	 * it before returning.
+	 */
+	if (atomic_inc_return(&conn->name_count) > KDBUS_CONN_MAX_NAMES) {
+		ret = -E2BIG;
+		goto exit_dec;
+	}
+
+	ret = kdbus_name_acquire(conn->ep->bus->name_registry, conn, item_name,
+				 cmd->flags, &cmd->return_flags);
+
+exit_dec:
+	/*
+	 * Drop the temporary reservation in every case; an actually acquired
+	 * name is accounted separately when the owner of the entry is set.
+	 */
+	atomic_dec(&conn->name_count);
+exit:
+	return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_name_release() - handle KDBUS_CMD_NAME_RELEASE
+ * @conn:		connection to operate on
+ * @argp:		command payload
+ *
+ * Only ordinary connections may release names; any other connection type
+ * gets -EOPNOTSUPP. The name to release is taken from the mandatory
+ * KDBUS_ITEM_NAME item of the command.
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_name_release(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_NAME, .mandatory = true },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+	struct kdbus_cmd *cmd;
+	const char *name;
+	int ret;
+
+	if (!kdbus_conn_is_ordinary(conn))
+		return -EOPNOTSUPP;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret)
+		return ret;
+
+	name = argv[1].item->str;
+	ret = kdbus_name_release(conn->ep->bus->name_registry, conn, name);
+
+	return kdbus_args_clear(&args, ret);
+}
+
+/*
+ * Emit one list record for connection @c, optionally carrying the name @e.
+ *
+ * @conn:	connection issuing the list command; used for the policy check
+ * @c:		connection the record describes (its id and flags are reported)
+ * @slice:	pool slice to copy into; only used if @write is true
+ * @pos:	offset into @slice; advanced by the size of the record
+ * @e:		name entry to append, or NULL for a record without a name
+ * @write:	if false, nothing is copied and only @pos is advanced
+ *
+ * A name @conn is not allowed to see is skipped silently: 0 is returned and
+ * @pos is left untouched. Returns 0 on success, or the negative error of the
+ * copy into the slice.
+ */
+static int kdbus_list_write(struct kdbus_conn *conn,
+			    struct kdbus_conn *c,
+			    struct kdbus_pool_slice *slice,
+			    size_t *pos,
+			    struct kdbus_name_entry *e,
+			    bool write)
+{
+	/* at most: info header, item header, name string, padding */
+	struct kvec kvec[4];
+	size_t cnt = 0;
+	int ret;
+
+	/* info header */
+	struct kdbus_info info = {
+		.size = 0,
+		.id = c->id,
+		.flags = c->flags,
+	};
+
+	/* fake the header of a kdbus_name item */
+	struct {
+		u64 size;
+		u64 type;
+		u64 flags;
+	} h = {};
+
+	if (e && !kdbus_conn_policy_see_name_unlocked(conn, current_cred(),
+						      e->name))
+		return 0;
+
+	/* presumably each kdbus_kvec_set() also adds its length to info.size */
+	kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &info.size);
+
+	/* append name */
+	if (e) {
+		size_t slen = strlen(e->name) + 1;
+
+		h.size = offsetof(struct kdbus_item, name.name) + slen;
+		h.type = KDBUS_ITEM_OWNED_NAME;
+		h.flags = e->flags;
+
+		kdbus_kvec_set(&kvec[cnt++], &h, sizeof(h), &info.size);
+		kdbus_kvec_set(&kvec[cnt++], e->name, slen, &info.size);
+		/* the padding vector only counts if kdbus_kvec_pad() reports one */
+		cnt += !!kdbus_kvec_pad(&kvec[cnt], &info.size);
+	}
+
+	if (write) {
+		ret = kdbus_pool_slice_copy_kvec(slice, *pos, kvec,
+						 cnt, info.size);
+		if (ret < 0)
+			return ret;
+	}
+
+	*pos += info.size;
+	return 0;
+}
+
+/*
+ * Emit list records for all connections on the bus of @conn.
+ *
+ * @conn:	connection issuing the list command
+ * @flags:	KDBUS_LIST_* flags selecting what to report
+ * @slice:	pool slice to copy into; only used if @write is true
+ * @pos:	start offset; updated to the end offset on success only
+ * @write:	if false, only the required size is accumulated in @pos
+ *
+ * Monitors are always skipped; activators are skipped unless
+ * KDBUS_LIST_ACTIVATORS is set. A connection for which no name record was
+ * attempted is reported with a name-less record if KDBUS_LIST_UNIQUE is set.
+ *
+ * No per-connection lock is taken here, so the error paths must not unlock
+ * one either; the lists are walked under the locks held by the caller.
+ *
+ * Returns 0 on success, or the first negative error of kdbus_list_write().
+ */
+static int kdbus_list_all(struct kdbus_conn *conn, u64 flags,
+			  struct kdbus_pool_slice *slice,
+			  size_t *pos, bool write)
+{
+	struct kdbus_conn *c;
+	size_t p = *pos;
+	int ret, i;
+
+	hash_for_each(conn->ep->bus->conn_hash, i, c, hentry) {
+		bool added = false;
+
+		/* skip monitors */
+		if (kdbus_conn_is_monitor(c))
+			continue;
+
+		/* skip activators */
+		if (!(flags & KDBUS_LIST_ACTIVATORS) &&
+		    kdbus_conn_is_activator(c))
+			continue;
+
+		/* all names the connection owns */
+		if (flags & (KDBUS_LIST_NAMES | KDBUS_LIST_ACTIVATORS)) {
+			struct kdbus_name_entry *e;
+
+			list_for_each_entry(e, &c->names_list, conn_entry) {
+				struct kdbus_conn *a = e->activator;
+
+				/* report the activator behind a name owned by c */
+				if ((flags & KDBUS_LIST_ACTIVATORS) &&
+				    a && a != c) {
+					ret = kdbus_list_write(conn, a, slice,
+							       &p, e, write);
+					if (ret < 0)
+						return ret;
+
+					added = true;
+				}
+
+				if (flags & KDBUS_LIST_NAMES ||
+				    kdbus_conn_is_activator(c)) {
+					ret = kdbus_list_write(conn, c, slice,
+							       &p, e, write);
+					if (ret < 0)
+						return ret;
+
+					added = true;
+				}
+			}
+		}
+
+		/* queue of names the connection is currently waiting for */
+		if (flags & KDBUS_LIST_QUEUED) {
+			struct kdbus_name_pending *q;
+
+			list_for_each_entry(q, &c->names_queue_list,
+					    conn_entry) {
+				ret = kdbus_list_write(conn, c, slice, &p,
+						       q->name, write);
+				if (ret < 0)
+					return ret;
+
+				added = true;
+			}
+		}
+
+		/* nothing added so far, just add the unique ID */
+		if (!added && flags & KDBUS_LIST_UNIQUE) {
+			ret = kdbus_list_write(conn, c, slice, &p, NULL, write);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	*pos = p;
+	return 0;
+}
+
+/**
+ * kdbus_cmd_list() - handle KDBUS_CMD_LIST
+ * @conn:		connection to operate on
+ * @argp:		command payload
+ *
+ * The list is produced in two passes over the same data while the registry,
+ * connection and policy locks are held for reading: the first pass only
+ * computes the required size, the second one copies the records into a
+ * freshly allocated slice of the pool of @conn. Offset and size of the result
+ * are written back to the user-supplied command.
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_list(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_name_registry *reg = conn->ep->bus->name_registry;
+	struct kdbus_pool_slice *slice = NULL;
+	struct kdbus_cmd_list *cmd;
+	size_t pos, size;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_LIST_UNIQUE |
+				 KDBUS_LIST_NAMES |
+				 KDBUS_LIST_ACTIVATORS |
+				 KDBUS_LIST_QUEUED,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	/* lock order: domain -> bus -> ep -> names -> conn */
+	/* NOTE(review): no per-connection lock is taken in this function */
+	down_read(&reg->rwlock);
+	down_read(&conn->ep->bus->conn_rwlock);
+	down_read(&conn->ep->policy_db.entries_rwlock);
+
+	/* size of records */
+	size = 0;
+	ret = kdbus_list_all(conn, cmd->flags, NULL, &size, false);
+	if (ret < 0)
+		goto exit_unlock;
+
+	if (size == 0) {
+		/* nothing to report: no slice is allocated for an empty list */
+		kdbus_pool_publish_empty(conn->pool, &cmd->offset,
+					 &cmd->list_size);
+	} else {
+		slice = kdbus_pool_slice_alloc(conn->pool, size, false);
+		if (IS_ERR(slice)) {
+			ret = PTR_ERR(slice);
+			/* the common exit path must not see an ERR_PTR */
+			slice = NULL;
+			goto exit_unlock;
+		}
+
+		/* copy the records */
+		pos = 0;
+		ret = kdbus_list_all(conn, cmd->flags, slice, &pos, true);
+		if (ret < 0)
+			goto exit_unlock;
+
+		/* both passes ran under the same locks, so sizes must agree */
+		WARN_ON(pos != size);
+		kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->list_size);
+	}
+
+	if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) ||
+	    kdbus_member_set_user(&cmd->list_size, argp,
+				  typeof(*cmd), list_size))
+		ret = -EFAULT;
+
+exit_unlock:
+	/* unlock in reverse order of acquisition */
+	up_read(&conn->ep->policy_db.entries_rwlock);
+	up_read(&conn->ep->bus->conn_rwlock);
+	up_read(&reg->rwlock);
+	/* slice is NULL here if none was allocated */
+	kdbus_pool_slice_release(slice);
+	return kdbus_args_clear(&args, ret);
+}
diff --git a/ipc/kdbus/names.h b/ipc/kdbus/names.h
new file mode 100644
index 000000000..3dd258929
--- /dev/null
+++ b/ipc/kdbus/names.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_NAMES_H
+#define __KDBUS_NAMES_H
+
+#include <linux/hashtable.h>
+#include <linux/rwsem.h>
+
+/**
+ * struct kdbus_name_registry - names registered for a bus
+ * @entries_hash:	Map of entries
+ * @rwlock:		Registry data lock
+ * @name_seq_last:	Last used sequence number to assign to a name entry
+ */
+struct kdbus_name_registry {
+	DECLARE_HASHTABLE(entries_hash, 8);
+	struct rw_semaphore rwlock;
+	u64 name_seq_last;
+};
+
+/**
+ * struct kdbus_name_entry - well-known name entry
+ * @name_id:		Sequence number of name entry to be able to uniquely
+ *			identify a name over its registration lifetime
+ * @flags:		KDBUS_NAME_* flags
+ * @conn:		Connection owning the name
+ * @activator:		Connection of the activator queuing incoming messages
+ * @queue:		List of queued connections
+ * @conn_entry:		Entry in connection
+ * @hentry:		Entry in registry map
+ * @name:		The well-known name
+ */
+struct kdbus_name_entry {
+	u64 name_id;
+	u64 flags;
+	struct kdbus_conn *conn;
+	struct kdbus_conn *activator;
+	struct list_head queue;
+	struct list_head conn_entry;
+	struct hlist_node hentry;
+	char name[];
+};
+
+bool kdbus_name_is_valid(const char *p, bool allow_wildcard);
+
+struct kdbus_name_registry *kdbus_name_registry_new(void);
+void kdbus_name_registry_free(struct kdbus_name_registry *reg);
+
+struct kdbus_name_entry *
+kdbus_name_lookup_unlocked(struct kdbus_name_registry *reg, const char *name);
+
+int kdbus_name_acquire(struct kdbus_name_registry *reg,
+		       struct kdbus_conn *conn, const char *name,
+		       u64 flags, u64 *return_flags);
+void kdbus_name_release_all(struct kdbus_name_registry *reg,
+			    struct kdbus_conn *conn);
+
+int kdbus_cmd_name_acquire(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_name_release(struct kdbus_conn *conn, void __user *argp);
+int kdbus_cmd_list(struct kdbus_conn *conn, void __user *argp);
+
+#endif
diff --git a/ipc/kdbus/node.c b/ipc/kdbus/node.c
new file mode 100644
index 000000000..89f58bc85
--- /dev/null
+++ b/ipc/kdbus/node.c
@@ -0,0 +1,897 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/atomic.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/kdev_t.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+
+#include "bus.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "fs.h"
+#include "handle.h"
+#include "node.h"
+#include "util.h"
+
+/**
+ * DOC: kdbus nodes
+ *
+ * Nodes unify lifetime management across exposed kdbus objects and provide a
+ * hierarchy. Each kdbus object, that might be exposed to user-space, has a
+ * kdbus_node object embedded and is linked into the hierarchy. Each node can
+ * have any number (0-n) of child nodes linked. Each child retains a reference
+ * to its parent node. For root-nodes, the parent is NULL.
+ *
+ * Each node object goes through a bunch of states during its lifetime:
+ *     * NEW
+ *       * LINKED    (can be skipped by NEW->FREED transition)
+ *         * ACTIVE  (can be skipped by LINKED->INACTIVE transition)
+ *       * INACTIVE
+ *       * DRAINED
+ *     * FREED
+ *
+ * Each node is allocated by the caller and initialized via kdbus_node_init().
+ * This never fails and sets the object into state NEW. From now on, ref-counts
+ * on the node manage its lifetime. During init, the ref-count is set to 1. Once
+ * it drops to 0, the node goes to state FREED and the node->free_cb() callback
+ * is called to deallocate any memory.
+ *
+ * After initializing a node, you usually link it into the hierarchy. You need
+ * to provide a parent node and a name. The node will be linked as child to the
+ * parent and a globally unique ID is assigned to the child. The name of the
+ * child must be unique for all children of this parent. Otherwise, linking the
+ * child will fail with -EEXIST.
+ * Note that the child is not marked active, yet. Admittedly, it prevents any
+ * other node from being linked with the same name (thus, it reserves that
+ * name), but any child-lookup (via name or unique ID) will never return this
+ * child unless it has been marked active.
+ *
+ * Once successfully linked, you can use kdbus_node_activate() to activate a
+ * child. This will mark the child active. This state can be skipped by directly
+ * deactivating the child via kdbus_node_deactivate() (see below).
+ * By activating a child, you enable any lookups on this child to succeed from
+ * now on. Furthermore, any code that got its hands on a reference to the node,
+ * can from now on "acquire" the node.
+ *
+ *     Active References (or: 'acquiring' and 'releasing' a node)
+ *     Additionally to normal object references, nodes support something we call
+ *     "active references". An active reference can be acquired via
+ *     kdbus_node_acquire() and released via kdbus_node_release(). A caller
+ *     _must_ own a normal object reference whenever calling those functions.
+ *     Unlike object references, acquiring an active reference can fail (by
+ *     returning 'false' from kdbus_node_acquire()). An active reference can
+ *     only be acquired if the node is marked active. If it is not marked
+ *     active, yet, or if it was already deactivated, no more active references
+ *     can be acquired, ever!
+ *     Active references are used to track tasks working on a node. Whenever a
+ *     task enters kernel-space to perform an action on a node, it acquires an
+ *     active reference, performs the action and releases the reference again.
+ *     While holding an active reference, the node is guaranteed to stay active.
+ *     If the node is deactivated in parallel, the node is marked as
+ *     deactivated, then we wait for all active references to be dropped, before
+ *     we finally proceed with any cleanups. That is, if you hold an active
+ *     reference to a node, any resources that are bound to the "active" state
+ *     are guaranteed to stay accessible until you release your reference.
+ *
+ *     Active-references are very similar to rw-locks, where acquiring a node is
+ *     equal to try-read-lock and releasing to read-unlock. Deactivating a node
+ *     means write-lock and never releasing it again.
+ *     Unlike rw-locks, the 'active reference' concept is more versatile and
+ *     avoids unusual rw-lock usage (never releasing a write-lock..).
+ *
+ *     It is safe to acquire multiple active-references recursively. But you
+ *     need to check the return value of kdbus_node_acquire() on _each_ call. It
+ *     may stop granting references at _any_ time.
+ *
+ *     You're free to perform any operations you want while holding an active
+ *     reference, except sleeping for an indefinite period. Sleeping for a fixed
+ *     amount of time is fine, but you usually should not wait on wait-queues
+ *     without a timeout.
+ *     For example, if you wait for I/O to happen, you should gather all data
+ *     and schedule the I/O operation, then release your active reference and
+ *     wait for it to complete. Then try to acquire a new reference. If it
+ *     fails, perform any cleanup (the node is now dead). Otherwise, you can
+ *     finish your operation.
+ *
+ * All nodes can be deactivated via kdbus_node_deactivate() at any time. You can
+ * call this multiple times, even in parallel or on nodes that were never
+ * linked, and it will just work. The only restriction is, you must not hold an
+ * active reference when calling kdbus_node_deactivate().
+ * By deactivating a node, it is immediately marked inactive. Then, we wait for
+ * all active references to be released (called 'draining' the node). This
+ * shouldn't take very long as we don't perform long-lasting operations while
+ * holding an active reference. Note that once the node is marked inactive, no
+ * new active references can be acquired.
+ * Once all active references are dropped, the node is considered 'drained'. Now
+ * kdbus_node_deactivate() is called on each child of the node before we
+ * continue deactivating our node. That is, once all children are entirely
+ * deactivated, we call ->release_cb() of our node. ->release_cb() can release
+ * any resources on that node which are bound to the "active" state of a node.
+ * When done, we unlink the node from its parent rb-tree, mark it as
+ * 'released' and return.
+ * If kdbus_node_deactivate() is called multiple times (even in parallel), all
+ * but one caller will just wait until the node is fully deactivated. That is,
+ * one random caller of kdbus_node_deactivate() is selected to call
+ * ->release_cb() and cleanup the node. Only once all this is done, all other
+ * callers will return from kdbus_node_deactivate(). That is, it doesn't matter
+ * whether you're the selected caller or not, it will only return after
+ * everything is fully done.
+ *
+ * When a node is activated, we acquire a normal object reference to the node.
+ * This reference is dropped after deactivation is fully done (and only iff the
+ * node really was activated). This allows callers to link+activate a child node
+ * and then drop all refs. The node will be deactivated together with the
+ * parent, and then be freed when this reference is dropped.
+ *
+ * Currently, nodes provide a bunch of resources that external code can use
+ * directly. This includes:
+ *
+ *     * node->waitq: Each node has its own wait-queue that is used to manage
+ *                    the 'active' state. When a node is deactivated, we wait on
+ *                    this queue until all active refs are dropped. Analogously,
+ *                    when you release an active reference on a deactivated
+ *                    node, and the active ref-count drops to 0, we wake up a
+ *                    single thread on this queue. Furthermore, once the
+ *                    ->release_cb() callback finished, we wake up all waiters.
+ *                    The node-owner is free to re-use this wait-queue for other
+ *                    purposes. As node-management uses this queue only during
+ *                    deactivation, it is usually totally fine to re-use the
+ *                    queue for other, preferably low-overhead, use-cases.
+ *
+ *     * node->type: This field defines the type of the owner of this node. It
+ *                   must be set during node initialization and must remain
+ *                   constant. The node management never looks at this value,
+ *                   but external users might use it to gain access to the owner
+ *                   object of a node.
+ *                   It is totally up to the owner of the node to define what
+ *                   their type means. Usually it means you can access the
+ *                   parent structure via container_of(), as long as you hold an
+ *                   active reference to the node.
+ *
+ *     * node->free_cb:    callback after all references are dropped
+ *       node->release_cb: callback during node deactivation
+ *                         These fields must be set by the node owner during
+ *                         node initialization. They must remain constant. If
+ *                         NULL, they're skipped.
+ *
+ *     * node->mode: filesystem access modes
+ *       node->uid:  filesystem owner uid
+ *       node->gid:  filesystem owner gid
+ *                   These fields must be set by the node owner during node
+ *                   initialization. They must remain constant and may be
+ *                   accessed by other callers to properly initialize
+ *                   filesystem nodes.
+ *
+ *     * node->id: This is an unsigned 32bit integer allocated by an IDA. It is
+ *                 always kept as small as possible during allocation and is
+ *                 globally unique across all nodes allocated by this module. 0
+ *                 is reserved as "not assigned" and is the default.
+ *                 The ID is assigned during kdbus_node_link() and is kept until
+ *                 the object is freed. Thus, the ID surpasses the active
+ *                 lifetime of a node. As long as you hold an object reference
+ *                 to a node (and the node was linked once), the ID is valid and
+ *                 unique.
+ *
+ *     * node->name: name of this node
+ *       node->hash: 31bit hash-value of @name (range [2..INT_MAX-1])
+ *                   These values follow the same lifetime rules as node->id.
+ *                   They're initialized when the node is linked and then remain
+ *                   constant until the last object reference is dropped.
+ *                   Unlike the id, the name is only unique across all siblings
+ *                   and only until the node is deactivated. Currently, the name
+ *                   is even unique if linked but not activated, yet. This might
+ *                   change in the future, though. Code should not rely on this.
+ *
+ *     * node->lock:     lock to protect node->children, node->rb, node->parent
+ *     * node->parent: Reference to parent node. This is set during LINK time
+ *                     and is dropped during destruction. You must not access
+ *                     it unless you hold an active reference to the node or if
+ *                     you know the node is dead.
+ *     * node->children: rb-tree of all linked children of this node. You must
+ *                       not access this directly, but use one of the iterator
+ *                       or lookup helpers.
+ */
+
+/*
+ * Bias values track states of "active references". They're all negative. If a
+ * node is active, its active-ref-counter is >=0 and tracks all active
+ * references. Once a node is deactivated, we add NODE_BIAS. This means, the
+ * counter is now negative but still counts the active references. Once it drops
+ * to exactly NODE_BIAS, we know all active references were dropped. Exactly one
+ * thread will change it to NODE_RELEASE now, perform cleanup and then put it
+ * into NODE_DRAINED. Once drained, all other threads that tried deactivating
+ * the node will now be woken up (thus, they wait until the node is fully done).
+ * The initial state during node-setup is NODE_NEW. If a node is directly
+ * deactivated without having ever been active, it is put into
+ * NODE_RELEASE_DIRECT instead of NODE_BIAS. This tracks this one-bit state
+ * across node-deactivation. The task putting it into NODE_RELEASE now knows
+ * whether the node was active before or not.
+ *
+ * Some archs implement atomic_sub(v) with atomic_add(-v), so reserve INT_MIN
+ * to avoid overflows if multiplied by -1.
+ */
+#define KDBUS_NODE_BIAS			(INT_MIN + 5)
+#define KDBUS_NODE_RELEASE_DIRECT	(KDBUS_NODE_BIAS - 1)
+#define KDBUS_NODE_RELEASE		(KDBUS_NODE_BIAS - 2)
+#define KDBUS_NODE_DRAINED		(KDBUS_NODE_BIAS - 3)
+#define KDBUS_NODE_NEW			(KDBUS_NODE_BIAS - 4)
+
+/* IDA backing the globally unique node->id values, see kdbus_node_link() */
+DEFINE_IDA(kdbus_node_ida);
+
+/**
+ * kdbus_node_name_hash() - compute the lookup hash of a node name
+ * @name:	Name to hash
+ *
+ * The raw string hash is truncated to 31 bits and then moved out of the
+ * reserved slots: 0, 1 and INT_MAX are never returned, as the filesystem
+ * code uses them for its magic directory entries.
+ *
+ * Return: hash of @name, always within [2..INT_MAX-1]
+ */
+static unsigned int kdbus_node_name_hash(const char *name)
+{
+	unsigned int h = kdbus_strhash(name) & INT_MAX;
+
+	/* 0 and 1 are reserved for magic directories; shift them up */
+	if (h < 2)
+		return h + 2;
+	/* so is INT_MAX; h cannot exceed it after the masking above */
+	if (h == INT_MAX)
+		return INT_MAX - 1;
+
+	return h;
+}
+
+/**
+ * kdbus_node_name_compare() - order a (hash, name) pair against a node
+ * @hash:	hash of @name
+ * @name:	name to compare with the name of @node
+ * @node:	node to compare against
+ *
+ * Hashes are compared first, as unsigned values; strcmp() breaks ties.
+ *
+ * Return: 0 on exact match, <0 if the pair sorts before @node, >0 if after.
+ */
+static int kdbus_node_name_compare(unsigned int hash, const char *name,
+				   const struct kdbus_node *node)
+{
+	if (hash != node->hash)
+		return hash < node->hash ? -1 : 1;
+
+	return strcmp(name, node->name);
+}
+
+/**
+ * kdbus_node_init() - set up a freshly allocated kdbus_node
+ * @node:	Node to initialize; allocated and zeroed by the caller
+ * @type:	Type of the node owner (KDBUS_NODE_*)
+ *
+ * Leaves @node unlinked, in state NEW, holding a single object reference.
+ * From here on its lifetime is managed via kdbus_node_ref() and
+ * kdbus_node_unref().
+ */
+void kdbus_node_init(struct kdbus_node *node, unsigned int type)
+{
+	node->type = type;
+	node->id = 0;
+	node->children = RB_ROOT;
+	RB_CLEAR_NODE(&node->rb);
+	mutex_init(&node->lock);
+	init_waitqueue_head(&node->waitq);
+	atomic_set(&node->active, KDBUS_NODE_NEW);
+	atomic_set(&node->refcnt, 1);
+}
+
+/**
+ * kdbus_node_link() - link a node into the node hierarchy
+ * @node:	Node to link
+ * @parent:	Parent node, or %NULL to create a new root node
+ * @name:	Name of the node; may only be %NULL if @parent is %NULL
+ *
+ * Assigns a name, name-hash and globally unique ID to @node and inserts it
+ * into the children tree of @parent. Only nodes of type KDBUS_NODE_DOMAIN
+ * may be linked without a parent. This must not be called multiple times.
+ *
+ * Linking fails if @parent can no longer be acquired, if a sibling already
+ * uses @name, or if no memory or ID is available. A node that failed to
+ * link must not be activated; deactivating it is safe, though.
+ *
+ * Once you linked a node, you must call kdbus_node_deactivate() before you
+ * drop the last reference (even if you never activate the node).
+ *
+ * Return: 0 on success, -EINVAL on invalid arguments, -ENOMEM if @name
+ * cannot be copied, -ESHUTDOWN if @parent is not active, -EEXIST if @name
+ * is already taken, or the error returned by the ID allocator.
+ */
+int kdbus_node_link(struct kdbus_node *node, struct kdbus_node *parent,
+		    const char *name)
+{
+	struct rb_node **link, *up = NULL;
+	int id, ret = 0;
+
+	if (WARN_ON(node->type != KDBUS_NODE_DOMAIN && !parent))
+		return -EINVAL;
+
+	if (WARN_ON(parent && !name))
+		return -EINVAL;
+
+	if (name) {
+		node->name = kstrdup(name, GFP_KERNEL);
+		if (!node->name)
+			return -ENOMEM;
+
+		node->hash = kdbus_node_name_hash(name);
+	}
+
+	id = ida_simple_get(&kdbus_node_ida, 1, 0, GFP_KERNEL);
+	if (id < 0)
+		return id;
+
+	node->id = id;
+
+	/* root nodes are not part of any children tree */
+	if (!parent)
+		return 0;
+
+	/*
+	 * Hold an active reference on the parent while touching its tree, so
+	 * we never link below a parent that was already deactivated.
+	 */
+	if (!kdbus_node_acquire(parent))
+		return -ESHUTDOWN;
+
+	mutex_lock(&parent->lock);
+
+	/* descend to the leaf slot where @node belongs */
+	link = &parent->children.rb_node;
+	while (*link) {
+		struct kdbus_node *cur = kdbus_node_from_rb(*link);
+		int cmp;
+
+		up = *link;
+		cmp = kdbus_node_name_compare(node->hash, node->name, cur);
+		if (cmp == 0) {
+			/* a sibling already owns this name */
+			ret = -EEXIST;
+			goto exit_unlock;
+		}
+
+		link = cmp < 0 ? &cur->rb.rb_left : &cur->rb.rb_right;
+	}
+
+	/* add new node and rebalance the tree */
+	rb_link_node(&node->rb, up, link);
+	rb_insert_color(&node->rb, &parent->children);
+	node->parent = kdbus_node_ref(parent);
+
+exit_unlock:
+	mutex_unlock(&parent->lock);
+	kdbus_node_release(parent);
+	return ret;
+}
+
+/**
+ * kdbus_node_ref() - take an object reference
+ * @node:	node to reference (or NULL)
+ *
+ * The caller must already own a reference to @node. Passing NULL is
+ * allowed and turns this into a no-op.
+ *
+ * Return: @node, to allow assignment chaining
+ */
+struct kdbus_node *kdbus_node_ref(struct kdbus_node *node)
+{
+	if (!node)
+		return NULL;
+	atomic_inc(&node->refcnt);
+	return node;
+}
+
+/**
+ * kdbus_node_unref() - Drop object reference
+ * @node:	node to drop reference to (or NULL)
+ *
+ * This drops an object reference to @node. You must not access the node if you
+ * no longer own a reference. If the ref-count drops to 0, the node is
+ * destroyed: ->free_cb() is called (if set), its ID and name are released and
+ * the reference it held on its parent is dropped in turn.
+ *
+ * If you linked or activated the node, you must deactivate the node before you
+ * drop your last reference! Otherwise, you can drop any reference you want.
+ *
+ * Note that this calls into ->free_cb() and thus _might_ sleep. The ->free_cb()
+ * callbacks must not acquire any outer locks, though. So you can safely drop
+ * references while holding locks. If @node is NULL, this is a no-op.
+ *
+ * Return: This always returns NULL
+ */
+struct kdbus_node *kdbus_node_unref(struct kdbus_node *node)
+{
+	while (node && atomic_dec_and_test(&node->refcnt)) {
+		/* ->free_cb() may free @node; save what we need afterwards */
+		struct kdbus_node *parent = node->parent;
+		unsigned int id = node->id;
+		char *name = node->name;
+
+		WARN_ON(atomic_read(&node->active) != KDBUS_NODE_DRAINED);
+		WARN_ON(!RB_EMPTY_NODE(&node->rb));
+
+		if (node->free_cb)
+			node->free_cb(node);
+		if (id > 0)
+			ida_simple_remove(&kdbus_node_ida, id);
+
+		kfree(name);
+
+		/*
+		 * kdbusfs relies on the parent to be available even after the
+		 * node was deactivated and unlinked. Therefore, it stays
+		 * pinned until the node is destroyed; drop that pin now.
+		 */
+		node = parent;
+	}
+
+	return NULL;
+}
+
+/**
+ * kdbus_node_is_active() - test whether a node is active
+ * @node:	node to test
+ *
+ * A node is active from its activation until it is deactivated. Only then
+ * kdbus_node_acquire() hands out active references.
+ *
+ * This is a snapshot without any lifetime guarantee: the node may be
+ * deactivated right after the call returns. Use kdbus_node_acquire() if
+ * you need the node to stay active.
+ *
+ * Return: true if @node is active, false otherwise
+ */
+bool kdbus_node_is_active(struct kdbus_node *node)
+{
+	int v = atomic_read(&node->active);
+
+	return v >= 0;
+}
+
+/**
+ * kdbus_node_is_deactivated() - test whether a node was already deactivated
+ * @node:	node to test
+ *
+ * This checks whether kdbus_node_deactivate() was called on @node, be it
+ * directly or implicitly by deactivating one of its ancestors. Nodes that
+ * are still new or active are reported as not deactivated.
+ *
+ * Note that even if this returns 'false', the node might get deactivated
+ * immediately after the call returns.
+ *
+ * Return: true if @node was already deactivated, false if not
+ */
+bool kdbus_node_is_deactivated(struct kdbus_node *node)
+{
+	int v = atomic_read(&node->active);
+
+	/* every negative state except NEW implies a deactivate call */
+	return v < 0 && v != KDBUS_NODE_NEW;
+}
+
+/**
+ * kdbus_node_activate() - activate a node
+ * @node:	node to activate
+ *
+ * This marks @node as active if, and only if, it is still in its initial
+ * state, i.e. it was neither activated nor deactivated before. Hence, any
+ * but the first call to kdbus_node_activate() is a no-op, and so is even
+ * the first call if kdbus_node_deactivate() was called earlier.
+ *
+ * This call doesn't give any lifetime guarantees. The node might get
+ * deactivated immediately after this call returns. Note that the state of
+ * the parent is not inspected here.
+ *
+ * If this call successfully activated a node, it will take an object reference
+ * to it. This reference is dropped after the node is deactivated. Therefore,
+ * the object owner can safely drop their reference to @node iff they know that
+ * its parent node will get deactivated at some point. Once the parent node is
+ * deactivated, it will deactivate all its children and thus drop this
+ * reference again.
+ *
+ * Return: True if this call successfully activated the node, otherwise false.
+ *         Note that this might return false, even if the node is still active
+ *         (eg., if you called this a second time).
+ */
+bool kdbus_node_activate(struct kdbus_node *node)
+{
+	bool activated;
+
+	mutex_lock(&node->lock);
+	activated = atomic_read(&node->active) == KDBUS_NODE_NEW;
+	if (activated) {
+		/* NEW - NEW == 0: active, with no active references yet */
+		atomic_sub(KDBUS_NODE_NEW, &node->active);
+		/* activated nodes have ref +1 */
+		kdbus_node_ref(node);
+	}
+	mutex_unlock(&node->lock);
+
+	return activated;
+}
+
+/**
+ * kdbus_node_deactivate() - deactivate a node
+ * @node:	The node to deactivate.
+ *
+ * This function recursively deactivates this node and all its children. It
+ * returns only once all children and the node itself were recursively disabled
+ * (even if you call this function multiple times in parallel).
+ *
+ * It is safe to call this function on _any_ node that was initialized _any_
+ * number of times.
+ *
+ * This call may sleep, as it waits for all active references to be dropped.
+ */
+void kdbus_node_deactivate(struct kdbus_node *node)
+{
+	struct kdbus_node *pos, *child;
+	struct rb_node *rb;
+	int v_pre, v_post;
+
+	pos = node;
+
+	/*
+	 * To avoid recursion, we perform back-tracking while deactivating
+	 * nodes. For each node we enter, we first mark the active-counter as
+	 * deactivated by adding BIAS and wait for all active refs to be
+	 * dropped. If the node has children, we set the first child as
+	 * current position and start over. If the node has no children (any
+	 * more), we release the node.
+	 *
+	 * After the node is released, we set its parent as current position
+	 * and start over. If the current position was the initial node, we're
+	 * done.
+	 *
+	 * Note that this function can be called in parallel by multiple
+	 * callers. We make sure that each node is only released once, and any
+	 * racing caller will wait until the other thread fully released that
+	 * node.
+	 */
+
+	for (;;) {
+		/*
+		 * Add BIAS to node->active to mark it as inactive. If it was
+		 * never active before, immediately mark it as RELEASE_DIRECT
+		 * so we remember this state.
+		 * We cannot remember v_pre as we might iterate into the
+		 * children, overwriting v_pre, before we can release our node.
+		 */
+		mutex_lock(&pos->lock);
+		v_pre = atomic_read(&pos->active);
+		if (v_pre >= 0)
+			atomic_add_return(KDBUS_NODE_BIAS, &pos->active);
+		else if (v_pre == KDBUS_NODE_NEW)
+			atomic_set(&pos->active, KDBUS_NODE_RELEASE_DIRECT);
+		mutex_unlock(&pos->lock);
+
+		/* wait until all active references were dropped */
+		wait_event(pos->waitq,
+			   atomic_read(&pos->active) <= KDBUS_NODE_BIAS);
+
+		mutex_lock(&pos->lock);
+		/* recurse into first child if any */
+		rb = rb_first(&pos->children);
+		if (rb) {
+			child = kdbus_node_ref(kdbus_node_from_rb(rb));
+			mutex_unlock(&pos->lock);
+			pos = child;
+			continue;
+		}
+
+		/* mark object as RELEASE */
+		v_post = atomic_read(&pos->active);
+		if (v_post == KDBUS_NODE_BIAS ||
+		    v_post == KDBUS_NODE_RELEASE_DIRECT)
+			atomic_set(&pos->active, KDBUS_NODE_RELEASE);
+		mutex_unlock(&pos->lock);
+
+		/*
+		 * If this is the thread that marked the object as RELEASE, we
+		 * perform the actual release. Otherwise, we wait until the
+		 * release is done and the node is marked as DRAINED.
+		 */
+		if (v_post == KDBUS_NODE_BIAS ||
+		    v_post == KDBUS_NODE_RELEASE_DIRECT) {
+			if (pos->release_cb)
+				pos->release_cb(pos, v_post == KDBUS_NODE_BIAS);
+
+			if (pos->parent) {
+				mutex_lock(&pos->parent->lock);
+				if (!RB_EMPTY_NODE(&pos->rb)) {
+					rb_erase(&pos->rb,
+						 &pos->parent->children);
+					RB_CLEAR_NODE(&pos->rb);
+				}
+				mutex_unlock(&pos->parent->lock);
+			}
+
+			/* mark as DRAINED */
+			atomic_set(&pos->active, KDBUS_NODE_DRAINED);
+			wake_up_all(&pos->waitq);
+
+			/* drop VFS cache */
+			kdbus_fs_flush(pos);
+
+			/*
+			 * If the node was activated and someone added BIAS
+			 * to it to deactivate it, we, and only us, are
+			 * responsible to release the extra ref-count that was
+			 * taken once in kdbus_node_activate().
+			 * If the node was never activated, no-one ever
+			 * added BIAS, but instead skipped that state and
+			 * immediately went to NODE_RELEASE_DIRECT. In that case
+			 * we must not drop the reference.
+			 */
+			if (v_post == KDBUS_NODE_BIAS)
+				kdbus_node_unref(pos);
+		} else {
+			/* wait until object is DRAINED */
+			wait_event(pos->waitq,
+			    atomic_read(&pos->active) == KDBUS_NODE_DRAINED);
+		}
+
+		/*
+		 * We're done with the current node. Continue on its parent
+		 * again, which will try deactivating its next child, or itself
+		 * if no child is left.
+		 * If we've reached our initial node again, we are done and
+		 * can safely return.
+		 */
+		if (pos == node)
+			break;
+
+		child = pos;
+		pos = pos->parent;
+		kdbus_node_unref(child);
+	}
+}
+
+/**
+ * kdbus_node_acquire() - Acquire an active ref on a node
+ * @node:	The node (or NULL)
+ *
+ * Takes an active reference on @node, which only succeeds while the node is
+ * active. On success, drop it again via kdbus_node_release().
+ *
+ * Return: %true if @node was non-NULL and active
+ */
+bool kdbus_node_acquire(struct kdbus_node *node)
+{
+	if (!node)
+		return false;
+
+	return atomic_inc_unless_negative(&node->active);
+}
+
+/**
+ * kdbus_node_release() - Release an active ref on a node
+ * @node:	Node acquired via kdbus_node_acquire(), or NULL for a no-op;
+ *		the last ref dropped on a deactivated node wakes up a waiter
+ */
+void kdbus_node_release(struct kdbus_node *node)
+{
+	if (!node)
+		return;
+	if (atomic_dec_return(&node->active) == KDBUS_NODE_BIAS)
+		wake_up(&node->waitq);
+}
+
+/**
+ * kdbus_node_find_child() - look up an active child by name
+ * @node:	parent node to search through
+ * @name:	name of the child to find
+ *
+ * Walks the children tree of @node, ordered by name-hash and name, looking
+ * for an exact match of @name. A match is only returned if an active
+ * reference could be acquired on it; linked-but-inactive and deactivated
+ * children are treated as if they did not exist.
+ *
+ * The returned child carries both an active reference and an object
+ * reference. Once done, release it via kdbus_node_release() and drop the
+ * reference via kdbus_node_unref().
+ *
+ * This function does not acquire the parent node. However, if the parent was
+ * already deactivated, then kdbus_node_deactivate() will, at some point, also
+ * deactivate the child. Therefore, we can rely on the explicit ordering during
+ * deactivation.
+ *
+ * Return: Reference to acquired child node, or NULL if not found / not active.
+ */
+struct kdbus_node *kdbus_node_find_child(struct kdbus_node *node,
+					 const char *name)
+{
+	unsigned int hash = kdbus_node_name_hash(name);
+	struct kdbus_node *found = NULL;
+	struct rb_node *rb;
+
+	mutex_lock(&node->lock);
+
+	rb = node->children.rb_node;
+	while (rb) {
+		struct kdbus_node *pos = kdbus_node_from_rb(rb);
+		int cmp = kdbus_node_name_compare(hash, name, pos);
+
+		if (cmp == 0) {
+			/* only hand out children that are still active */
+			if (kdbus_node_acquire(pos))
+				found = kdbus_node_ref(pos);
+			break;
+		}
+
+		rb = cmp < 0 ? rb->rb_left : rb->rb_right;
+	}
+
+	mutex_unlock(&node->lock);
+
+	return found;
+}
+
+/*
+ * Find the left-most child of @node sorting at or after (@hash, @name), or by
+ * @hash alone if @name is NULL. Hashes are compared as unsigned values, so a
+ * @hash above INT_MAX sorts behind all children. Caller holds @node->lock.
+ */
+static struct kdbus_node *node_find_closest_unlocked(struct kdbus_node *node,
+						     unsigned int hash,
+						     const char *name)
+{
+	struct kdbus_node *n, *pos = NULL;
+	struct rb_node *rb;
+	int res;
+
+	rb = node->children.rb_node;
+	while (rb) {
+		n = kdbus_node_from_rb(rb);
+
+		if (name)
+			res = kdbus_node_name_compare(hash, name, n);
+		else
+			res = (hash > n->hash) - (hash < n->hash);
+
+		if (res <= 0) {
+			rb = rb->rb_left;
+			pos = n;
+		} else { /* ``hash > n->hash'', ``name > n->name'' */
+			rb = rb->rb_right;
+		}
+	}
+
+	return pos;
+}
+
+/**
+ * kdbus_node_find_closest() - Find closest child-match
+ * @node:	parent node to search through
+ * @hash:	hash value to find closest match for
+ *
+ * Find the closest active child of @node with a hash greater than or equal
+ * to @hash. The closest match is the left-most child of @node with this
+ * property. Which means, it is the first child with that hash returned by
+ * kdbus_node_next_child(), if you'd iterate the whole parent node. Children
+ * that cannot be acquired are skipped in favor of their successors.
+ *
+ * Return: Reference to acquired child (release and unref it when done), or
+ * NULL if none found.
+ */
+struct kdbus_node *kdbus_node_find_closest(struct kdbus_node *node,
+					   unsigned int hash)
+{
+	struct kdbus_node *pos;
+	struct rb_node *next;
+
+	mutex_lock(&node->lock);
+
+	/* start at the closest match and skip forward over inactive nodes */
+	pos = node_find_closest_unlocked(node, hash, NULL);
+	while (pos && !kdbus_node_acquire(pos)) {
+		next = rb_next(&pos->rb);
+		pos = next ? kdbus_node_from_rb(next) : NULL;
+	}
+	kdbus_node_ref(pos);
+
+	mutex_unlock(&node->lock);
+
+	return pos;
+}
+
+/**
+ * kdbus_node_next_child() - Acquire next child
+ * @node:	parent node
+ * @prev:	previous child-node position or NULL
+ *
+ * This function returns a reference to the next active child of @node, after
+ * the passed position @prev. If @prev is NULL, a reference to the first active
+ * child is returned. If no more active children are found, NULL is returned.
+ * Children that cannot be acquired (not yet active, or already deactivated) are
+ * skipped.
+ *
+ * This function acquires the next child it returns. If you're done with the
+ * returned pointer, you need to release _and_ unref it. Neither the active
+ * reference nor the object reference on @prev is touched; dropping those
+ * remains the job of the caller.
+ *
+ * The passed in pointer @prev is not modified by this function, and it does
+ * *not* have to be active. If @prev was acquired via different means, or if it
+ * was unlinked from its parent before you pass it in, then this iterator will
+ * still return the next active child (it will have to search through the
+ * rb-tree based on the node-name, though).
+ * However, @prev must not be linked to a different parent than @node!
+ *
+ * Return: Reference to next acquired child, or NULL if at the end.
+ */
+struct kdbus_node *kdbus_node_next_child(struct kdbus_node *node,
+					 struct kdbus_node *prev)
+{
+	struct kdbus_node *cur;
+	struct rb_node *rb;
+
+	mutex_lock(&node->lock);
+
+	if (prev && !RB_EMPTY_NODE(&prev->rb)) {
+		/*
+		 * The current iterator is still linked to the parent. It was
+		 * handed out before, so the scan below continues right
+		 * behind it.
+		 */
+		cur = prev;
+	} else {
+		if (prev) {
+			/*
+			 * The current iterator is no longer linked to the
+			 * rb-tree. Use its hash value and name to find the
+			 * closest node that sorts at or after it.
+			 */
+			cur = node_find_closest_unlocked(node, prev->hash,
+							 prev->name);
+		} else {
+			/* New iteration; start at the first child, if any */
+			rb = rb_first(&node->children);
+			cur = rb ? kdbus_node_from_rb(rb) : NULL;
+		}
+
+		/*
+		 * The start node is a candidate itself. Return it if we
+		 * can acquire it, or stop if there is no start node at
+		 * all. Otherwise, the loop below will find the next
+		 * active node.
+		 */
+		if (!cur || kdbus_node_acquire(cur))
+			goto exit;
+	}
+
+	/* @cur was already returned or is inactive; find next active node */
+	do {
+		rb = rb_next(&cur->rb);
+		cur = rb ? kdbus_node_from_rb(rb) : NULL;
+	} while (cur && !kdbus_node_acquire(cur));
+
+exit:
+	/* @cur is NULL or acquired. Take ref if non-NULL and return it */
+	kdbus_node_ref(cur);
+	mutex_unlock(&node->lock);
+	return cur;
+}
diff --git a/ipc/kdbus/node.h b/ipc/kdbus/node.h
new file mode 100644
index 000000000..970e02b08
--- /dev/null
+++ b/ipc/kdbus/node.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_NODE_H
+#define __KDBUS_NODE_H
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+
+struct kdbus_node;
+
+enum kdbus_node_type { /* presumably the values of kdbus_node.type; confirm */
+	KDBUS_NODE_DOMAIN,
+	KDBUS_NODE_CONTROL,
+	KDBUS_NODE_BUS,
+	KDBUS_NODE_ENDPOINT,
+};
+
+typedef void (*kdbus_node_free_t) (struct kdbus_node *node); /* ->free_cb */
+typedef void (*kdbus_node_release_t) (struct kdbus_node *node, bool was_active); /* ->release_cb */
+
+struct kdbus_node {
+	atomic_t refcnt;
+	atomic_t active;
+	wait_queue_head_t waitq;
+
+	/* static members */
+	unsigned int type; /* presumably an enum kdbus_node_type value */
+	kdbus_node_free_t free_cb;
+	kdbus_node_release_t release_cb;
+	umode_t mode;
+	kuid_t uid;
+	kgid_t gid;
+
+	/* valid once linked */
+	char *name; /* with @hash: used to re-find a position in the parent */
+	unsigned int hash;
+	unsigned int id;
+	struct kdbus_node *parent; /* may be NULL */
+
+	/* valid iff active */
+	struct mutex lock; /* taken while walking @children */
+	struct rb_node rb; /* link in the parent's @children tree */
+	struct rb_root children; /* rb-tree of child nodes */
+};
+
+#define kdbus_node_from_rb(_node) rb_entry((_node), struct kdbus_node, rb) /* rb link -> node */
+
+extern struct ida kdbus_node_ida;
+
+void kdbus_node_init(struct kdbus_node *node, unsigned int type);
+
+int kdbus_node_link(struct kdbus_node *node, struct kdbus_node *parent,
+		    const char *name);
+
+struct kdbus_node *kdbus_node_ref(struct kdbus_node *node);
+struct kdbus_node *kdbus_node_unref(struct kdbus_node *node);
+
+bool kdbus_node_is_active(struct kdbus_node *node);
+bool kdbus_node_is_deactivated(struct kdbus_node *node);
+bool kdbus_node_activate(struct kdbus_node *node);
+void kdbus_node_deactivate(struct kdbus_node *node);
+
+bool kdbus_node_acquire(struct kdbus_node *node);
+void kdbus_node_release(struct kdbus_node *node);
+
+struct kdbus_node *kdbus_node_find_child(struct kdbus_node *node,
+					 const char *name);
+struct kdbus_node *kdbus_node_find_closest(struct kdbus_node *node,
+					   unsigned int hash);
+struct kdbus_node *kdbus_node_next_child(struct kdbus_node *node,
+					 struct kdbus_node *prev);
+
+#endif
diff --git a/ipc/kdbus/notify.c b/ipc/kdbus/notify.c
new file mode 100644
index 000000000..375758c48
--- /dev/null
+++ b/ipc/kdbus/notify.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "item.h"
+#include "message.h"
+#include "notify.h"
+
+static inline void kdbus_notify_add_tail(struct kdbus_staging *staging,
+					 struct kdbus_bus *bus)
+{
+	spin_lock(&bus->notify_lock); /* guards bus->notify_list */
+	list_add_tail(&staging->notify_entry, &bus->notify_list);
+	spin_unlock(&bus->notify_lock);
+}
+
+static int kdbus_notify_reply(struct kdbus_bus *bus, u64 id,
+			      u64 cookie, u64 msg_type)
+{
+	struct kdbus_staging *staging;
+
+	/* reply notifications request no extra payload, hence size 0 */
+	staging = kdbus_staging_new_kernel(bus, id, cookie, 0, msg_type);
+	if (!IS_ERR(staging))
+		kdbus_notify_add_tail(staging, bus);
+
+	return IS_ERR(staging) ? PTR_ERR(staging) : 0;
+}
+
+/**
+ * kdbus_notify_reply_timeout() - queue a timeout reply
+ * @bus:		Bus which queues the messages
+ * @id:			The destination's connection ID
+ * @cookie:		The cookie to set in the reply.
+ *
+ * Queues a KDBUS_ITEM_REPLY_TIMEOUT message; kdbus_notify_flush() sends it.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kdbus_notify_reply_timeout(struct kdbus_bus *bus, u64 id, u64 cookie)
+{
+	return kdbus_notify_reply(bus, id, cookie, KDBUS_ITEM_REPLY_TIMEOUT);
+}
+
+/**
+ * kdbus_notify_reply_dead() - queue a 'dead' reply
+ * @bus:		Bus which queues the messages
+ * @id:			The destination's connection ID
+ * @cookie:		The cookie to set in the reply.
+ *
+ * Queues a KDBUS_ITEM_REPLY_DEAD message; kdbus_notify_flush() sends it.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kdbus_notify_reply_dead(struct kdbus_bus *bus, u64 id, u64 cookie)
+{
+	return kdbus_notify_reply(bus, id, cookie, KDBUS_ITEM_REPLY_DEAD);
+}
+
+/**
+ * kdbus_notify_name_change() - queue a notification about a name owner change
+ * @bus:		Bus which queues the messages
+ * @type:		The type of the notification; KDBUS_ITEM_NAME_ADD,
+ *			KDBUS_ITEM_NAME_CHANGE or KDBUS_ITEM_NAME_REMOVE
+ * @old_id:		The id of the connection that used to own the name
+ * @new_id:		The id of the new owner connection
+ * @old_flags:		The flags to pass in the KDBUS_ITEM flags field for
+ *			the old owner
+ * @new_flags:		The flags to pass in the KDBUS_ITEM flags field for
+ *			the new owner
+ * @name:		The name that was removed or assigned to a new owner
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kdbus_notify_name_change(struct kdbus_bus *bus, u64 type,
+			     u64 old_id, u64 new_id,
+			     u64 old_flags, u64 new_flags,
+			     const char *name)
+{
+	/* payload size: the fixed struct plus @name including its NUL */
+	size_t name_size = strlen(name) + 1;
+	struct kdbus_staging *staging;
+
+	staging = kdbus_staging_new_kernel(bus, KDBUS_DST_ID_BROADCAST, 0,
+			sizeof(struct kdbus_notify_name_change) + name_size,
+			type);
+	if (IS_ERR(staging))
+		return PTR_ERR(staging);
+
+	staging->notify->name_change.old_id.id = old_id;
+	staging->notify->name_change.old_id.flags = old_flags;
+	staging->notify->name_change.new_id.id = new_id;
+	staging->notify->name_change.new_id.flags = new_flags;
+	memcpy(staging->notify->name_change.name, name, name_size);
+
+	/* only queued here; kdbus_notify_flush() performs the delivery */
+	kdbus_notify_add_tail(staging, bus);
+	return 0;
+}
+
+/**
+ * kdbus_notify_id_change() - queue a notification about a unique ID change
+ * @bus:		Bus which queues the messages
+ * @type:		The type of the notification; KDBUS_ITEM_ID_ADD or
+ *			KDBUS_ITEM_ID_REMOVE
+ * @id:			The id of the connection that was added or removed
+ * @flags:		The flags to pass in the KDBUS_ITEM flags field
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int kdbus_notify_id_change(struct kdbus_bus *bus, u64 type, u64 id, u64 flags)
+{
+	const size_t payload = sizeof(struct kdbus_notify_id_change);
+	struct kdbus_staging *staging;
+
+	/* ID notifications are broadcast and use 0 as their cookie */
+	staging = kdbus_staging_new_kernel(bus, KDBUS_DST_ID_BROADCAST, 0,
+					   payload, type);
+	if (IS_ERR(staging))
+		return PTR_ERR(staging);
+
+	staging->notify->id_change.id = id;
+	staging->notify->id_change.flags = flags;
+
+	kdbus_notify_add_tail(staging, bus);
+	return 0;
+}
+
+/**
+ * kdbus_notify_flush() - send a list of collected messages
+ * @bus:		Bus which queues the messages
+ *
+ * The list is emptied; unicasts to connections that are not found are dropped.
+ */
+void kdbus_notify_flush(struct kdbus_bus *bus)
+{
+	LIST_HEAD(notify_list);
+	struct kdbus_staging *s, *tmp;
+
+	mutex_lock(&bus->notify_flush_lock); /* one flusher at a time */
+	down_read(&bus->name_registry->rwlock);
+
+	spin_lock(&bus->notify_lock);
+	list_splice_init(&bus->notify_list, &notify_list); /* take the queue */
+	spin_unlock(&bus->notify_lock); /* delivery runs without the spinlock */
+
+	list_for_each_entry_safe(s, tmp, &notify_list, notify_entry) {
+		if (s->msg->dst_id != KDBUS_DST_ID_BROADCAST) {
+			struct kdbus_conn *conn;
+
+			conn = kdbus_bus_find_conn_by_id(bus, s->msg->dst_id);
+			if (conn) {
+				kdbus_bus_eavesdrop(bus, NULL, s);
+				kdbus_conn_entry_insert(NULL, conn, s, NULL,
+							NULL);
+				kdbus_conn_unref(conn);
+			}
+		} else {
+			kdbus_bus_broadcast(bus, NULL, s);
+		}
+
+		list_del(&s->notify_entry); /* freed whether or not delivered */
+		kdbus_staging_free(s);
+	}
+
+	up_read(&bus->name_registry->rwlock);
+	mutex_unlock(&bus->notify_flush_lock);
+}
+
+/**
+ * kdbus_notify_free() - discard all collected messages without sending them
+ * @bus:		Bus which queues the messages
+ */
+void kdbus_notify_free(struct kdbus_bus *bus)
+{
+	struct kdbus_staging *staging, *next;
+
+	list_for_each_entry_safe(staging, next, &bus->notify_list, notify_entry) {
+		list_del(&staging->notify_entry); /* unlink before freeing */
+		kdbus_staging_free(staging);
+	}
+}
diff --git a/ipc/kdbus/notify.h b/ipc/kdbus/notify.h
new file mode 100644
index 000000000..03df464cb
--- /dev/null
+++ b/ipc/kdbus/notify.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_NOTIFY_H
+#define __KDBUS_NOTIFY_H
+
+struct kdbus_bus;
+
+int kdbus_notify_id_change(struct kdbus_bus *bus, u64 type, u64 id, u64 flags);
+int kdbus_notify_reply_timeout(struct kdbus_bus *bus, u64 id, u64 cookie);
+int kdbus_notify_reply_dead(struct kdbus_bus *bus, u64 id, u64 cookie);
+int kdbus_notify_name_change(struct kdbus_bus *bus, u64 type,
+			     u64 old_id, u64 new_id,
+			     u64 old_flags, u64 new_flags,
+			     const char *name);
+void kdbus_notify_flush(struct kdbus_bus *bus);
+void kdbus_notify_free(struct kdbus_bus *bus);
+
+#endif
diff --git a/ipc/kdbus/policy.c b/ipc/kdbus/policy.c
new file mode 100644
index 000000000..f2618e15e
--- /dev/null
+++ b/ipc/kdbus/policy.c
@@ -0,0 +1,489 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "domain.h"
+#include "item.h"
+#include "names.h"
+#include "policy.h"
+
+#define KDBUS_POLICY_HASH_SIZE	64
+
+/**
+ * struct kdbus_policy_db_entry_access - a database entry access item
+ * @type:		One of KDBUS_POLICY_ACCESS_* types
+ * @access:		Access to grant. One of KDBUS_POLICY_*
+ * @uid:		For KDBUS_POLICY_ACCESS_USER, the global uid
+ * @gid:		For KDBUS_POLICY_ACCESS_GROUP, the global gid
+ * @list:		List entry item for the entry's list
+ *
+ * This is the internal version of struct kdbus_policy_access.
+ */
+struct kdbus_policy_db_entry_access {
+	u8 type;		/* USER, GROUP, WORLD */
+	u8 access;		/* OWN, TALK, SEE */
+	union {
+		kuid_t uid;	/* global uid */
+		kgid_t gid;	/* global gid */
+	};
+	struct list_head list;	/* link in kdbus_policy_db_entry.access_list */
+};
+
+/**
+ * struct kdbus_policy_db_entry - a policy database entry
+ * @name:		The name to match the policy entry against
+ * @hentry:		The hash entry for the database's entries_hash
+ * @access_list:	List head for keeping track of the entry's
+ *			access items.
+ * @owner:		The owner of this entry. Can be a kdbus_conn or
+ *			a kdbus_ep object.
+ * @wildcard:		The name is a wildcard, such as ending on '.*'
+ */
+struct kdbus_policy_db_entry {
+	char *name;
+	struct hlist_node hentry;
+	struct list_head access_list;
+	const void *owner;	/* only ever compared by pointer value */
+	bool wildcard:1;
+};
+
+static void kdbus_policy_entry_free(struct kdbus_policy_db_entry *e)
+{
+	struct kdbus_policy_db_entry_access *acc, *next;
+
+	/* release every access item, then the name string, then @e itself */
+	list_for_each_entry_safe(acc, next, &e->access_list, list) {
+		list_del(&acc->list);
+		kfree(acc);
+	}
+	kfree(e->name);
+	kfree(e);
+}
+
+static unsigned int kdbus_strnhash(const char *str, size_t len)
+{
+	unsigned long h = init_name_hash();
+	size_t i;
+
+	for (i = 0; i < len; i++) /* exactly @len bytes; NUL is not special */
+		h = partial_name_hash(str[i], h);
+	return end_name_hash(h);
+}
+
+static const struct kdbus_policy_db_entry *
+kdbus_policy_lookup(struct kdbus_policy_db *db, const char *name, u32 hash)
+{
+	struct kdbus_policy_db_entry *entry;
+	const char *last_dot;
+	size_t prefix_len;
+
+	/* pass 1: a non-wildcard entry whose name equals @name */
+	hash_for_each_possible(db->entries_hash, entry, hentry, hash)
+		if (!entry->wildcard && !strcmp(entry->name, name))
+			return entry;
+
+	/* pass 2: a wildcard entry equal to @name cut at its last dot */
+	last_dot = strrchr(name, '.');
+	if (!last_dot)
+		return NULL;
+
+	prefix_len = last_dot - name;
+	hash = kdbus_strnhash(name, prefix_len);
+
+	hash_for_each_possible(db->entries_hash, entry, hentry, hash)
+		if (entry->wildcard &&
+		    !strncmp(entry->name, name, prefix_len) &&
+		    !entry->name[prefix_len])
+			return entry;
+
+	return NULL;
+}
+
+/**
+ * kdbus_policy_db_clear() - drop and free every entry of a policy db
+ * @db:		The policy database
+ */
+void kdbus_policy_db_clear(struct kdbus_policy_db *db)
+{
+	struct kdbus_policy_db_entry *entry;
+	struct hlist_node *next;
+	unsigned int bucket;
+
+	down_write(&db->entries_rwlock);
+	/* unhash each entry before freeing it */
+	hash_for_each_safe(db->entries_hash, bucket, next, entry, hentry) {
+		hash_del(&entry->hentry);
+		kdbus_policy_entry_free(entry);
+	}
+	up_write(&db->entries_rwlock);
+}
+
+/**
+ * kdbus_policy_db_init() - initialize a new policy database
+ * @db:		The location of the database
+ *
+ * Sets up the entry hashtable and its rw-semaphore. The underlying memory
+ * must have been cleared to zero by the caller.
+ */
+void kdbus_policy_db_init(struct kdbus_policy_db *db)
+{
+	init_rwsem(&db->entries_rwlock);
+	hash_init(db->entries_hash);
+}
+
+/**
+ * kdbus_policy_query_unlocked() - Query the policy database
+ * @db:		Policy database
+ * @cred:	Credentials to test against
+ * @name:	Name to query
+ * @hash:	Hash value of @name
+ *
+ * Same as kdbus_policy_query() but requires the caller to lock the policy
+ * database against concurrent writes.
+ *
+ * Return: The highest KDBUS_POLICY_* access type found, or -EPERM if none.
+ */
+int kdbus_policy_query_unlocked(struct kdbus_policy_db *db,
+				const struct cred *cred, const char *name,
+				unsigned int hash)
+{
+	struct kdbus_policy_db_entry_access *a;
+	const struct kdbus_policy_db_entry *e;
+	int i, highest = -EPERM;
+
+	e = kdbus_policy_lookup(db, name, hash);
+	if (!e)
+		return -EPERM;
+
+	list_for_each_entry(a, &e->access_list, list) {
+		if ((int)a->access <= highest)
+			continue; /* this item cannot raise the result */
+
+		switch (a->type) {
+		case KDBUS_POLICY_ACCESS_USER:
+			if (uid_eq(cred->euid, a->uid))
+				highest = a->access;
+			break;
+		case KDBUS_POLICY_ACCESS_GROUP:
+			if (gid_eq(cred->egid, a->gid)) {
+				highest = a->access;
+				break;
+			}
+			/* egid did not match; try the groups in group_info */
+			for (i = 0; i < cred->group_info->ngroups; i++) {
+				kgid_t gid = GROUP_AT(cred->group_info, i);
+
+				if (gid_eq(gid, a->gid)) {
+					highest = a->access;
+					break;
+				}
+			}
+
+			break;
+		case KDBUS_POLICY_ACCESS_WORLD:
+			highest = a->access; /* granted without any id check */
+			break;
+		}
+
+		/* OWN is the highest possible policy */
+		if (highest >= KDBUS_POLICY_OWN)
+			break;
+	}
+
+	return highest;
+}
+
+/**
+ * kdbus_policy_query() - Query the policy database
+ * @db:		Policy database
+ * @cred:	Credentials to test against
+ * @name:	Name to query
+ * @hash:	Hash value of @name
+ *
+ * Locked variant of kdbus_policy_query_unlocked(): the database is held for
+ * reading while the access rights of @cred to the name @name are looked up.
+ * The access rights of @cred are returned, or -EPERM if no access is granted.
+ *
+ * This call effectively searches for the highest access-right granted to
+ * @cred. The caller should really cache those as policy lookups are rather
+ * expensive.
+ *
+ * Return: The highest KDBUS_POLICY_* access type found, or -EPERM if none.
+ */
+int kdbus_policy_query(struct kdbus_policy_db *db, const struct cred *cred,
+		       const char *name, unsigned int hash)
+{
+	int access;
+
+	down_read(&db->entries_rwlock);
+	access = kdbus_policy_query_unlocked(db, cred, name, hash);
+	up_read(&db->entries_rwlock);
+
+	return access;
+}
+
+static void __kdbus_policy_remove_owner(struct kdbus_policy_db *db,
+					const void *owner)
+{
+	struct kdbus_policy_db_entry *entry;
+	struct hlist_node *next;
+	int bucket;
+
+	hash_for_each_safe(db->entries_hash, bucket, next, entry, hentry)
+		if (owner == entry->owner) { /* unhash, then free */
+			hash_del(&entry->hentry);
+			kdbus_policy_entry_free(entry);
+		}
+}
+
+/**
+ * kdbus_policy_remove_owner() - remove all entries related to an owner
+ * @db:		The policy database
+ * @owner:	The owner (compared by pointer) whose entries are removed
+ */
+void kdbus_policy_remove_owner(struct kdbus_policy_db *db,
+			       const void *owner)
+{
+	down_write(&db->entries_rwlock); /* exclude readers and other writers */
+	__kdbus_policy_remove_owner(db, owner);
+	up_write(&db->entries_rwlock);
+}
+
+/*
+ * Translate a user-supplied struct kdbus_policy_access into a newly
+ * allocated internal access item. USER and GROUP ids are converted with
+ * the caller's user namespace and rejected if the result is not valid.
+ *
+ * Returns the new item, or ERR_PTR(-ENOMEM) / ERR_PTR(-EINVAL) on failure.
+ */
+static struct kdbus_policy_db_entry_access *
+kdbus_policy_make_access(const struct kdbus_policy_access *uaccess)
+{
+	struct kdbus_policy_db_entry_access *acc;
+
+	acc = kzalloc(sizeof(*acc), GFP_KERNEL);
+	if (!acc)
+		return ERR_PTR(-ENOMEM);
+
+	/* only the three known access levels are accepted */
+	if (uaccess->access != KDBUS_POLICY_SEE &&
+	    uaccess->access != KDBUS_POLICY_TALK &&
+	    uaccess->access != KDBUS_POLICY_OWN)
+		goto err_free;
+
+	/* the id is only meaningful for USER and GROUP items */
+	switch (uaccess->type) {
+	case KDBUS_POLICY_ACCESS_USER:
+		acc->uid = make_kuid(current_user_ns(), uaccess->id);
+		if (!uid_valid(acc->uid))
+			goto err_free;
+		break;
+
+	case KDBUS_POLICY_ACCESS_GROUP:
+		acc->gid = make_kgid(current_user_ns(), uaccess->id);
+		if (!gid_valid(acc->gid))
+			goto err_free;
+		break;
+
+	case KDBUS_POLICY_ACCESS_WORLD:
+		/* nothing to convert */
+		break;
+
+	default:
+		goto err_free;
+	}
+
+	acc->access = uaccess->access;
+	acc->type = uaccess->type;
+	return acc;
+
+err_free:
+	kfree(acc);
+	return ERR_PTR(-EINVAL);
+}
+
+/**
+ * kdbus_policy_set() - set a connection's policy rules
+ * @db:				The policy database
+ * @items:			A list of kdbus_item elements that contain both
+ *				names and access rules to set.
+ * @items_size:			The total size of the items.
+ * @max_policies:		The maximum number of policy entries to allow.
+ *				Pass 0 for no limit.
+ * @allow_wildcards:		Boolean value whether wildcard entries (such
+ *				as names ending in '.*') should be allowed.
+ * @owner:			The owner of the new policy items.
+ *
+ * This function sets a new set of policies for a given owner. The names and
+ * access rules are gathered by walking the list of items passed in as
+ * argument. An item of type KDBUS_ITEM_NAME is expected before any number of
+ * KDBUS_ITEM_POLICY_ACCESS items. If there are more repetitions of this
+ * pattern than denoted in @max_policies, -E2BIG is returned.
+ *
+ * In order to allow atomic replacement of rules, the function first removes
+ * all entries that have been created for the given owner previously.
+ *
+ * Callers to this function must make sure that the owner is a custom
+ * endpoint, or if the endpoint is a default endpoint, then it must be
+ * either a policy holder or an activator.
+ *
+ * Return: 0 on success, negative errno (-EEXIST on duplicates) on failure.
+ */
+int kdbus_policy_set(struct kdbus_policy_db *db,
+		     const struct kdbus_item *items,
+		     size_t items_size,
+		     size_t max_policies,
+		     bool allow_wildcards,
+		     const void *owner)
+{
+	struct kdbus_policy_db_entry_access *a;
+	struct kdbus_policy_db_entry *e, *p;
+	const struct kdbus_item *item;
+	struct hlist_node *tmp;
+	HLIST_HEAD(entries);
+	HLIST_HEAD(restore);
+	size_t count = 0;
+	int i, ret = 0;
+	u32 hash;
+
+	/* Walk the list of items and look for new policies */
+	e = NULL;
+	KDBUS_ITEMS_FOREACH(item, items, items_size) {
+		switch (item->type) {
+		case KDBUS_ITEM_NAME: {
+			size_t len;
+
+			if (max_policies && ++count > max_policies) {
+				ret = -E2BIG;
+				goto exit;
+			}
+
+			if (!kdbus_name_is_valid(item->str, true)) {
+				ret = -EINVAL;
+				goto exit;
+			}
+
+			e = kzalloc(sizeof(*e), GFP_KERNEL);
+			if (!e) {
+				ret = -ENOMEM;
+				goto exit;
+			}
+
+			INIT_LIST_HEAD(&e->access_list);
+			e->owner = owner;
+			hlist_add_head(&e->hentry, &entries);
+
+			e->name = kstrdup(item->str, GFP_KERNEL);
+			if (!e->name) {
+				ret = -ENOMEM;
+				goto exit;
+			}
+
+			/*
+			 * If a supplied name ends with an '.*', cut off that
+			 * part, only store anything before it, and mark the
+			 * entry as wildcard. @len excludes the NUL byte.
+			 */
+			len = strlen(e->name);
+			if (len > 2 &&
+			    e->name[len - 2] == '.' &&
+			    e->name[len - 1] == '*') {
+				if (!allow_wildcards) {
+					ret = -EINVAL;
+					goto exit;
+				}
+
+				e->name[len - 2] = '\0';
+				e->wildcard = true;
+			}
+
+			break;
+		}
+
+		case KDBUS_ITEM_POLICY_ACCESS:
+			if (!e) {
+				ret = -EINVAL; /* access item without a name */
+				goto exit;
+			}
+
+			a = kdbus_policy_make_access(&item->policy_access);
+			if (IS_ERR(a)) {
+				ret = PTR_ERR(a);
+				goto exit;
+			}
+
+			list_add_tail(&a->list, &e->access_list);
+			break;
+		}
+	}
+
+	down_write(&db->entries_rwlock);
+
+	/* remember previous entries to restore in case of failure */
+	hash_for_each_safe(db->entries_hash, i, tmp, e, hentry)
+		if (e->owner == owner) {
+			hash_del(&e->hentry);
+			hlist_add_head(&e->hentry, &restore);
+		}
+
+	hlist_for_each_entry_safe(e, tmp, &entries, hentry) {
+		/* prevent duplicates */
+		hash = kdbus_strhash(e->name);
+		hash_for_each_possible(db->entries_hash, p, hentry, hash)
+			if (strcmp(e->name, p->name) == 0 &&
+			    e->wildcard == p->wildcard) {
+				ret = -EEXIST;
+				goto restore;
+			}
+
+		hlist_del(&e->hentry);
+		hash_add(db->entries_hash, &e->hentry, hash);
+	}
+
+restore:
+	/* if we failed, flush all entries we added so far */
+	if (ret < 0)
+		__kdbus_policy_remove_owner(db, owner);
+
+	/* if we failed, restore entries, otherwise release them */
+	hlist_for_each_entry_safe(e, tmp, &restore, hentry) {
+		hlist_del(&e->hentry);
+		if (ret < 0) {
+			hash = kdbus_strhash(e->name);
+			hash_add(db->entries_hash, &e->hentry, hash);
+		} else {
+			kdbus_policy_entry_free(e);
+		}
+	}
+
+	up_write(&db->entries_rwlock);
+
+exit:
+	/* whatever is still on @entries was never hashed; free it */
+	hlist_for_each_entry_safe(e, tmp, &entries, hentry) {
+		hlist_del(&e->hentry);
+		kdbus_policy_entry_free(e);
+	}
+
+	return ret;
+}
diff --git a/ipc/kdbus/policy.h b/ipc/kdbus/policy.h
new file mode 100644
index 000000000..15dd7bc12
--- /dev/null
+++ b/ipc/kdbus/policy.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_POLICY_H
+#define __KDBUS_POLICY_H
+
+#include <linux/hashtable.h>
+#include <linux/rwsem.h>
+
+struct kdbus_conn;
+struct kdbus_item;
+
+/**
+ * struct kdbus_policy_db - policy database
+ * @entries_hash:	Hashtable of entries
+ * @entries_rwlock:	RW-semaphore to protect the database's access entries
+ */
+struct kdbus_policy_db {
+	DECLARE_HASHTABLE(entries_hash, 6);
+	struct rw_semaphore entries_rwlock;
+};
+
+void kdbus_policy_db_init(struct kdbus_policy_db *db);
+void kdbus_policy_db_clear(struct kdbus_policy_db *db);
+
+int kdbus_policy_query_unlocked(struct kdbus_policy_db *db,
+				const struct cred *cred, const char *name,
+				unsigned int hash);
+int kdbus_policy_query(struct kdbus_policy_db *db, const struct cred *cred,
+		       const char *name, unsigned int hash);
+
+void kdbus_policy_remove_owner(struct kdbus_policy_db *db,
+			       const void *owner);
+int kdbus_policy_set(struct kdbus_policy_db *db,
+		     const struct kdbus_item *items,
+		     size_t items_size,
+		     size_t max_policies,
+		     bool allow_wildcards,
+		     const void *owner);
+
+#endif
diff --git a/ipc/kdbus/pool.c b/ipc/kdbus/pool.c
new file mode 100644
index 000000000..63ccd5571
--- /dev/null
+++ b/ipc/kdbus/pool.c
@@ -0,0 +1,728 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/aio.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/shmem_fs.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+
+#include "pool.h"
+#include "util.h"
+
+/**
+ * struct kdbus_pool - the receiver's buffer
+ * @f:			The backing shmem file
+ * @size:		The size of the file
+ * @accounted_size:	Currently accounted memory in bytes
+ * @lock:		Pool data lock
+ * @slices:		All slices sorted by address
+ * @slices_busy:	Tree of allocated slices, ordered by offset
+ * @slices_free:	Tree of free slices, ordered by size
+ *
+ * The receiver's buffer, managed as a pool of allocated and free
+ * slices containing the queued messages.
+ *
+ * Messages sent with KDBUS_CMD_SEND are copied directly by the
+ * sending process into the receiver's pool.
+ *
+ * Messages received with KDBUS_CMD_RECV just return the offset
+ * to the data placed in the pool.
+ *
+ * The internally allocated memory needs to be returned by the receiver
+ * with KDBUS_CMD_FREE.
+ */
+struct kdbus_pool {
+	struct file *f;
+	size_t size;
+	size_t accounted_size;
+	struct mutex lock;
+
+	struct list_head slices;
+	struct rb_root slices_busy;
+	struct rb_root slices_free;
+};
+
+/**
+ * struct kdbus_pool_slice - allocated element in kdbus_pool
+ * @pool:		Pool this slice belongs to
+ * @off:		Offset of slice in the shmem file
+ * @size:		Size of slice
+ * @entry:		Entry in "all slices" list
+ * @rb_node:		Entry in free or busy tree
+ * @free:		Unused slice
+ * @accounted:		Accounted as queue slice
+ * @ref_kernel:		Kernel holds a reference
+ * @ref_user:		Userspace holds a reference
+ *
+ * The pool has one or more slices, always spanning the entire size of the
+ * pool.
+ *
+ * Every slice is an element in a list sorted by the buffer address, to
+ * provide access to the next neighbor slice.
+ *
+ * Every slice is member in either the busy or the free tree. The free
+ * tree is organized by slice size, the busy tree organized by buffer
+ * offset.
+ */
+struct kdbus_pool_slice {
+	struct kdbus_pool *pool;
+	size_t off;
+	size_t size;
+
+	struct list_head entry;
+	struct rb_node rb_node;
+
+	bool free:1;
+	bool accounted:1;
+	bool ref_kernel:1;
+	bool ref_user:1;
+};
+
+static struct kdbus_pool_slice *kdbus_pool_slice_new(struct kdbus_pool *pool,
+						     size_t off, size_t size)
+{
+	struct kdbus_pool_slice *s = kzalloc(sizeof(*s), GFP_KERNEL);
+
+	/* zeroed allocation: the accounting and ref bits start out cleared */
+	if (s) {
+		s->pool = pool;
+		s->off = off;
+		s->size = size;
+		s->free = true; /* every new slice starts out as free */
+	}
+
+	return s;
+}
+
+/* link @slice into the free tree, which is ordered by slice size */
+static void kdbus_pool_add_free_slice(struct kdbus_pool *pool,
+				      struct kdbus_pool_slice *slice)
+{
+	struct rb_node **link = &pool->slices_free.rb_node;
+	struct rb_node *parent = NULL;
+	struct kdbus_pool_slice *cur;
+
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct kdbus_pool_slice, rb_node);
+
+		/* slices of equal size are placed in the right subtree */
+		if (slice->size >= cur->size)
+			link = &parent->rb_right;
+		else
+			link = &parent->rb_left;
+	}
+
+	rb_link_node(&slice->rb_node, parent, link);
+	rb_insert_color(&slice->rb_node, &pool->slices_free);
+}
+
+/* link @slice into the busy tree, which is ordered by slice offset */
+static void kdbus_pool_add_busy_slice(struct kdbus_pool *pool,
+				      struct kdbus_pool_slice *slice)
+{
+	struct rb_node **link = &pool->slices_busy.rb_node;
+	struct rb_node *parent = NULL;
+	struct kdbus_pool_slice *cur;
+
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct kdbus_pool_slice, rb_node);
+
+		if (slice->off > cur->off)
+			link = &parent->rb_right;
+		else if (slice->off < cur->off)
+			link = &parent->rb_left;
+		else
+			BUG(); /* an offset must never be busy twice */
+	}
+
+	/* @link is now the empty child slot of @parent to attach to */
+	rb_link_node(&slice->rb_node, parent, link);
+	rb_insert_color(&slice->rb_node, &pool->slices_busy);
+}
+
+static struct kdbus_pool_slice *kdbus_pool_find_slice(struct kdbus_pool *pool,
+						      size_t off)
+{
+	struct rb_node *node = pool->slices_busy.rb_node;
+	struct kdbus_pool_slice *cur;
+
+	/* descend the busy tree, which is keyed by slice offset */
+	while (node) {
+		cur = rb_entry(node, struct kdbus_pool_slice, rb_node);
+		if (off == cur->off)
+			return cur;
+
+		if (off < cur->off)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+
+	return NULL;
+}
+
+/**
+ * kdbus_pool_slice_alloc() - allocate memory from a pool
+ * @pool:	The receiver's pool
+ * @size:	The number of bytes to allocate
+ * @accounted:	Whether this slice should be accounted for
+ *
+ * The requested size is rounded up with KDBUS_ALIGN8() and carved out of the
+ * smallest free slice that can hold it. The returned slice carries the kernel
+ * reference; drop it with kdbus_pool_slice_release(). A zero @size triggers a
+ * WARN and fails with -EINVAL; if no free slice is large enough, -EXFULL.
+ *
+ * Return: the allocated slice on success, ERR_PTR on failure.
+ */
+struct kdbus_pool_slice *kdbus_pool_slice_alloc(struct kdbus_pool *pool,
+						size_t size, bool accounted)
+{
+	size_t slice_size = KDBUS_ALIGN8(size);
+	struct rb_node *n, *found = NULL;
+	struct kdbus_pool_slice *s;
+	int ret = 0;
+
+	if (WARN_ON(!size))
+		return ERR_PTR(-EINVAL);
+
+	/* search a free slice with the closest matching size */
+	mutex_lock(&pool->lock);
+	n = pool->slices_free.rb_node;
+	while (n) {
+		s = rb_entry(n, struct kdbus_pool_slice, rb_node);
+		if (slice_size < s->size) {
+			found = n; /* fits; keep looking for a tighter fit */
+			n = n->rb_left;
+		} else if (slice_size > s->size) {
+			n = n->rb_right;
+		} else {
+			found = n; /* exact fit; @n stays non-NULL */
+			break;
+		}
+	}
+
+	/* no slice with the minimum size found in the pool */
+	if (!found) {
+		ret = -EXFULL;
+		goto exit_unlock;
+	}
+
+	/* no exact match, use the closest one */
+	if (!n) {
+		struct kdbus_pool_slice *s_new;
+
+		s = rb_entry(found, struct kdbus_pool_slice, rb_node);
+
+		/* split-off the remainder of the size to its own slice */
+		s_new = kdbus_pool_slice_new(pool, s->off + slice_size,
+					     s->size - slice_size);
+		if (!s_new) {
+			ret = -ENOMEM;
+			goto exit_unlock;
+		}
+
+		list_add(&s_new->entry, &s->entry); /* directly after @s */
+		kdbus_pool_add_free_slice(pool, s_new);
+
+		/* adjust our size now that we split-off another slice */
+		s->size = slice_size;
+	}
+
+	/* move slice from free to the busy tree */
+	rb_erase(found, &pool->slices_free);
+	kdbus_pool_add_busy_slice(pool, s);
+
+	WARN_ON(s->ref_kernel || s->ref_user);
+
+	s->ref_kernel = true;
+	s->free = false;
+	s->accounted = accounted;
+	if (accounted)
+		pool->accounted_size += s->size;
+	mutex_unlock(&pool->lock);
+
+	return s;
+
+exit_unlock:
+	mutex_unlock(&pool->lock);
+	return ERR_PTR(ret);
+}
+
+static void __kdbus_pool_slice_release(struct kdbus_pool_slice *slice)
+{
+	struct kdbus_pool *pool = slice->pool;
+
+	/* don't free the slice if either has a reference */
+	if (slice->ref_kernel || slice->ref_user)
+		return;
+
+	if (WARN_ON(slice->free))
+		return;
+
+	rb_erase(&slice->rb_node, &pool->slices_busy);
+
+	/* merge with the next free slice; @slice absorbs it */
+	if (!list_is_last(&slice->entry, &pool->slices)) {
+		struct kdbus_pool_slice *s;
+
+		s = list_entry(slice->entry.next,
+			       struct kdbus_pool_slice, entry);
+		if (s->free) {
+			rb_erase(&s->rb_node, &pool->slices_free);
+			list_del(&s->entry);
+			slice->size += s->size;
+			kfree(s);
+		}
+	}
+
+	/* merge with previous free slice; it absorbs @slice, which is freed */
+	if (pool->slices.next != &slice->entry) {
+		struct kdbus_pool_slice *s;
+
+		s = list_entry(slice->entry.prev,
+			       struct kdbus_pool_slice, entry);
+		if (s->free) {
+			rb_erase(&s->rb_node, &pool->slices_free);
+			list_del(&slice->entry);
+			s->size += slice->size;
+			kfree(slice);
+			slice = s; /* continue with the merged slice */
+		}
+	}
+
+	slice->free = true;
+	kdbus_pool_add_free_slice(pool, slice); /* re-insert under new size */
+}
+
+/**
+ * kdbus_pool_slice_release() - drop kernel-reference on allocated slice
+ * @slice:		Slice allocated from the pool, may be %NULL
+ *
+ * Clears the kernel-reference of @slice and removes its size from the pool
+ * accounting if the slice was accounted. Once neither the kernel-reference
+ * nor the user-reference is set, the slice is handed back to the pool.
+ *
+ * Slices are not fully ref-counted: the kernel and user-space can hold
+ * exactly one reference each.
+ */
+void kdbus_pool_slice_release(struct kdbus_pool_slice *slice)
+{
+	struct kdbus_pool *pool;
+
+	if (!slice)
+		return;
+
+	/* the slice may be gone after the release, remember its pool */
+	pool = slice->pool;
+	mutex_lock(&pool->lock);
+	/* dropping a kernel ref that was never taken is a bug */
+	WARN_ON(!slice->ref_kernel);
+	slice->ref_kernel = false;
+	/* the slice is no longer kernel-owned, undo its accounting */
+	if (slice->accounted) {
+		if (!WARN_ON(pool->accounted_size < slice->size))
+			pool->accounted_size -= slice->size;
+	}
+	__kdbus_pool_slice_release(slice);
+	mutex_unlock(&pool->lock);
+}
+
+/**
+ * kdbus_pool_release_offset() - release a public offset
+ * @pool:		pool to operate on
+ * @off:		offset to release
+ *
+ * To be called when user-space frees a slice that was handed out to it. The
+ * slice found at @off must carry a user-space reference; that reference is
+ * dropped under the pool lock. The dummy offset of empty slices is accepted.
+ *
+ * Return: 0 on success, -ENXIO if the offset is invalid or not public.
+ */
+int kdbus_pool_release_offset(struct kdbus_pool *pool, size_t off)
+{
+	struct kdbus_pool_slice *slice;
+	int ret = -ENXIO;
+
+	/* the dummy offset of empty slices, there is nothing to release */
+	if (off == pool->size)
+		return 0;
+
+	mutex_lock(&pool->lock);
+	/* only a slice that user-space holds a reference on can be freed */
+	slice = kdbus_pool_find_slice(pool, off);
+	if (slice && slice->ref_user) {
+		slice->ref_user = false;
+		__kdbus_pool_slice_release(slice);
+		ret = 0;
+	}
+
+	mutex_unlock(&pool->lock);
+	return ret;
+}
+
+/**
+ * kdbus_pool_publish_empty() - publish empty slice to user-space
+ * @pool:		pool to operate on
+ * @off:		output storage for offset, or NULL
+ * @size:		output storage for size, or NULL
+ *
+ * Counterpart of kdbus_pool_slice_publish() that involves no real slice. The
+ * reported offset is the size of @pool, which kdbus_pool_release_offset()
+ * accepts without releasing anything; the reported size is 0.
+ */
+void kdbus_pool_publish_empty(struct kdbus_pool *pool, u64 *off, u64 *size)
+{
+	if (off)
+		*off = pool->size;	/* dummy offset of empty slices */
+	if (size)
+		*size = 0;
+}
+
+/**
+ * kdbus_pool_slice_publish() - publish slice to user-space
+ * @slice:		The slice
+ * @out_offset:		Output storage for offset, or NULL
+ * @out_size:		Output storage for size, or NULL
+ *
+ * This prepares a slice to be published to user-space:
+ *   * the slice is marked as referenced by user-space, so user-space has to
+ *     call KDBUS_CMD_FREE to release it
+ *   * the offset and size of the slice are written to the given output
+ *     arguments, if non-NULL
+ * The caller is expected to own the kernel reference on @slice.
+ */
+void kdbus_pool_slice_publish(struct kdbus_pool_slice *slice,
+			      u64 *out_offset, u64 *out_size)
+{
+	struct kdbus_pool *pool = slice->pool;
+
+	mutex_lock(&pool->lock);
+	/* a user-space ref can only be derived from a kernel ref */
+	WARN_ON(!slice->ref_kernel);
+	slice->ref_user = true;
+	mutex_unlock(&pool->lock);
+
+	if (out_offset)
+		*out_offset = slice->off;
+	if (out_size)
+		*out_size = slice->size;
+}
+
+/**
+ * kdbus_pool_slice_offset() - Get a slice's offset inside the pool
+ * @slice:	Slice to return the offset of
+ *
+ * Return: The offset of @slice inside its pool.
+ */
+off_t kdbus_pool_slice_offset(const struct kdbus_pool_slice *slice)
+{
+	return slice->off;
+}
+
+/**
+ * kdbus_pool_slice_size() - get size of a pool slice
+ * @slice:	slice to query
+ *
+ * Return: size of the given slice, in bytes
+ */
+size_t kdbus_pool_slice_size(const struct kdbus_pool_slice *slice)
+{
+	return slice->size;
+}
+
+/**
+ * kdbus_pool_new() - create a new pool
+ * @name:		Name of the (deleted) file which shows up in
+ *			/proc, used for debugging; may be NULL
+ * @size:		Maximum size of the pool
+ *
+ * Return: a new kdbus_pool on success, ERR_PTR on failure.
+ */
+struct kdbus_pool *kdbus_pool_new(const char *name, size_t size)
+{
+	struct kdbus_pool_slice *first;
+	struct kdbus_pool *pool;
+	struct file *shmem;
+	char *label = NULL;
+	int ret;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	if (name) {
+		label = kasprintf(GFP_KERNEL, KBUILD_MODNAME "-conn:%s", name);
+		if (!label) {
+			ret = -ENOMEM;
+			goto err_free_pool;
+		}
+	}
+
+	shmem = shmem_file_setup(label ?: KBUILD_MODNAME "-conn", size, 0);
+	kfree(label);
+
+	if (IS_ERR(shmem)) {
+		ret = PTR_ERR(shmem);
+		goto err_free_pool;
+	}
+
+	ret = get_write_access(file_inode(shmem));
+	if (ret < 0)
+		goto err_put_file;
+
+	/* a single free slice covers the whole pool initially */
+	first = kdbus_pool_slice_new(pool, 0, size);
+	if (!first) {
+		ret = -ENOMEM;
+		goto err_put_write;
+	}
+
+	pool->f = shmem;
+	pool->size = size;
+	pool->slices_free = RB_ROOT;
+	pool->slices_busy = RB_ROOT;
+	mutex_init(&pool->lock);
+
+	INIT_LIST_HEAD(&pool->slices);
+	list_add(&first->entry, &pool->slices);
+
+	kdbus_pool_add_free_slice(pool, first);
+	return pool;
+
+err_put_write:
+	put_write_access(file_inode(shmem));
+err_put_file:
+	fput(shmem);
+err_free_pool:
+	kfree(pool);
+	return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_pool_free() - destroy pool
+ * @pool:		The pool to destroy, may be %NULL
+ */
+void kdbus_pool_free(struct kdbus_pool *pool)
+{
+	struct kdbus_pool_slice *slice, *next;
+
+	if (!pool)
+		return;
+
+	list_for_each_entry_safe(slice, next, &pool->slices, entry) {
+		list_del(&slice->entry);
+		kfree(slice);
+	}
+
+	put_write_access(file_inode(pool->f));
+	fput(pool->f);
+	kfree(pool);
+}
+
+/**
+ * kdbus_pool_accounted() - retrieve accounting information
+ * @pool:		pool to query
+ * @size:		output for overall pool size, or NULL
+ * @acc:		output for currently accounted size, or NULL
+ *
+ * This returns accounting information of the pool. Note that the data might
+ * change after the function returns, as the pool lock is dropped. You need to
+ * protect the data via other means, if you need reliable accounting.
+ */
+void kdbus_pool_accounted(struct kdbus_pool *pool, size_t *size, size_t *acc)
+{
+	mutex_lock(&pool->lock);	/* read both values consistently */
+	if (size)
+		*size = pool->size;
+	if (acc)
+		*acc = pool->accounted_size;
+	mutex_unlock(&pool->lock);
+}
+
+/**
+ * kdbus_pool_slice_copy_iovec() - copy user memory to a slice
+ * @slice:		The slice to write to
+ * @off:		Offset in the slice to write to
+ * @iov:		iovec array, pointing to user memory to copy
+ * @iov_len:		Number of elements in @iov
+ * @total_len:		Total number of bytes described in members of @iov
+ *
+ * Return: @total_len if everything was copied, negative errno otherwise.
+ */
+ssize_t
+kdbus_pool_slice_copy_iovec(const struct kdbus_pool_slice *slice, loff_t off,
+			    struct iovec *iov, size_t iov_len, size_t total_len)
+{
+	struct iov_iter iter;
+	ssize_t len;
+
+	if (WARN_ON(off + total_len > slice->size))
+		return -EFAULT;
+
+	off += slice->off;
+	iov_iter_init(&iter, WRITE, iov, iov_len, total_len);
+	len = vfs_iter_write(slice->pool->f, &iter, &off);
+	if (len < 0)
+		return len;
+	/* a short write counts as a failure */
+	return len == total_len ? len : -EFAULT;
+}
+
+/**
+ * kdbus_pool_slice_copy_kvec() - copy kernel memory to a slice
+ * @slice:		The slice to write to
+ * @off:		Offset in the slice to write to
+ * @kvec:		kvec array, pointing to kernel memory to copy
+ * @kvec_len:		Number of elements in @kvec
+ * @total_len:		Total number of bytes described in members of @kvec
+ *
+ * Return: @total_len if everything was copied, negative errno otherwise.
+ */
+ssize_t kdbus_pool_slice_copy_kvec(const struct kdbus_pool_slice *slice,
+				   loff_t off, struct kvec *kvec,
+				   size_t kvec_len, size_t total_len)
+{
+	struct iov_iter iter;
+	mm_segment_t old_fs;
+	ssize_t len;
+
+	if (WARN_ON(off + total_len > slice->size))
+		return -EFAULT;
+
+	off += slice->off;
+	iov_iter_kvec(&iter, WRITE | ITER_KVEC, kvec, kvec_len, total_len);
+	/* the source is kernel memory, widen the address limit meanwhile */
+	old_fs = get_fs();
+	set_fs(get_ds());
+	len = vfs_iter_write(slice->pool->f, &iter, &off);
+	set_fs(old_fs);
+	if (len < 0)
+		return len;
+	/* a short write counts as a failure */
+	return len == total_len ? len : -EFAULT;
+}
+
+/**
+ * kdbus_pool_slice_copy() - copy data from one slice into another
+ * @slice_dst:		destination slice
+ * @slice_src:		source slice, must have the same size as @slice_dst
+ *
+ * Return: 0 on success, negative error number on failure.
+ */
+int kdbus_pool_slice_copy(const struct kdbus_pool_slice *slice_dst,
+			  const struct kdbus_pool_slice *slice_src)
+{
+	struct file *f_src = slice_src->pool->f;
+	struct file *f_dst = slice_dst->pool->f;
+	struct inode *i_dst = file_inode(f_dst);
+	struct address_space *mapping_dst = f_dst->f_mapping;
+	const struct address_space_operations *aops = mapping_dst->a_ops;
+	unsigned long len = slice_src->size;
+	loff_t off_src = slice_src->off;
+	loff_t off_dst = slice_dst->off;
+	mm_segment_t old_fs;
+	int ret = 0;
+
+	if (WARN_ON(slice_src->size != slice_dst->size) ||
+	    WARN_ON(slice_src->free || slice_dst->free))
+		return -EINVAL;
+
+	mutex_lock(&i_dst->i_mutex);
+	old_fs = get_fs();
+	set_fs(get_ds());	/* __vfs_read() into a kernel mapping */
+	while (len > 0) {
+		unsigned long page_off;
+		unsigned long copy_len;
+		char __user *kaddr;
+		struct page *page;
+		ssize_t n_read;
+		void *fsdata;
+		long status;
+
+		page_off = off_dst & (PAGE_CACHE_SIZE - 1);
+		copy_len = min_t(unsigned long,
+				 PAGE_CACHE_SIZE - page_off, len);
+
+		status = aops->write_begin(f_dst, mapping_dst, off_dst,
+					   copy_len, 0, &page, &fsdata);
+		if (unlikely(status < 0)) {
+			ret = status;
+			break;
+		}
+
+		kaddr = (char __force __user *)kmap(page) + page_off;
+		n_read = __vfs_read(f_src, kaddr, copy_len, &off_src);
+		kunmap(page);
+		mark_page_accessed(page);
+		flush_dcache_page(page);
+		/*
+		 * write_begin() handed out a locked, referenced page which
+		 * only write_end() releases again. Hence, call it even after
+		 * a short read, but report that nothing was copied then.
+		 */
+		status = aops->write_end(f_dst, mapping_dst, off_dst, copy_len,
+					 n_read == copy_len ? copy_len : 0,
+					 page, fsdata);
+		if (unlikely(n_read != copy_len || status != copy_len)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		off_dst += copy_len;
+		len -= copy_len;
+	}
+	set_fs(old_fs);
+	mutex_unlock(&i_dst->i_mutex);
+
+	return ret;
+}
+
+/**
+ * kdbus_pool_mmap() -  map the pool into the process
+ * @pool:		The receiver's pool
+ * @vma:		passed by mmap() syscall
+ *
+ * Return: the result of the backing file's mmap(), or -EPERM / -EFAULT.
+ */
+int kdbus_pool_mmap(const struct kdbus_pool *pool, struct vm_area_struct *vma)
+{
+	/* refuse writable mappings and clear VM_MAYWRITE on all others */
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+	vma->vm_flags &= ~VM_MAYWRITE;
+
+	/* do not allow to map more than the size of the file */
+	if ((vma->vm_end - vma->vm_start) > pool->size)
+		return -EFAULT;
+
+	/* drop the file @vma came with and map our shmem file in its place */
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	vma->vm_file = get_file(pool->f);
+
+	return pool->f->f_op->mmap(pool->f, vma);
+}
diff --git a/ipc/kdbus/pool.h b/ipc/kdbus/pool.h
new file mode 100644
index 000000000..a9038213a
--- /dev/null
+++ b/ipc/kdbus/pool.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_POOL_H
+#define __KDBUS_POOL_H
+
+#include <linux/uio.h>
+
+struct kdbus_pool;
+struct kdbus_pool_slice;
+
+struct kdbus_pool *kdbus_pool_new(const char *name, size_t size);
+void kdbus_pool_free(struct kdbus_pool *pool);
+void kdbus_pool_accounted(struct kdbus_pool *pool, size_t *size, size_t *acc);
+int kdbus_pool_mmap(const struct kdbus_pool *pool, struct vm_area_struct *vma);
+int kdbus_pool_release_offset(struct kdbus_pool *pool, size_t off);
+void kdbus_pool_publish_empty(struct kdbus_pool *pool, u64 *off, u64 *size);
+
+struct kdbus_pool_slice *kdbus_pool_slice_alloc(struct kdbus_pool *pool,
+						size_t size, bool accounted);
+void kdbus_pool_slice_release(struct kdbus_pool_slice *slice);
+void kdbus_pool_slice_publish(struct kdbus_pool_slice *slice,
+			      u64 *out_offset, u64 *out_size);
+off_t kdbus_pool_slice_offset(const struct kdbus_pool_slice *slice);
+size_t kdbus_pool_slice_size(const struct kdbus_pool_slice *slice);
+int kdbus_pool_slice_copy(const struct kdbus_pool_slice *slice_dst,
+			  const struct kdbus_pool_slice *slice_src);
+ssize_t kdbus_pool_slice_copy_kvec(const struct kdbus_pool_slice *slice,
+				   loff_t off, struct kvec *kvec,
+				   size_t kvec_count, size_t total_len);
+ssize_t kdbus_pool_slice_copy_iovec(const struct kdbus_pool_slice *slice,
+				    loff_t off, struct iovec *iov,
+				    size_t iov_count, size_t total_len);
+
+#endif
diff --git a/ipc/kdbus/queue.c b/ipc/kdbus/queue.c
new file mode 100644
index 000000000..f9c44d7ba
--- /dev/null
+++ b/ipc/kdbus/queue.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/audit.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/hashtable.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/math64.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uio.h>
+
+#include "util.h"
+#include "domain.h"
+#include "connection.h"
+#include "item.h"
+#include "message.h"
+#include "metadata.h"
+#include "queue.h"
+#include "reply.h"
+
+/**
+ * kdbus_queue_init() - initialize the list and priority tree of a queue
+ * @queue:	The queue to initialize
+ */
+void kdbus_queue_init(struct kdbus_queue *queue)
+{
+	INIT_LIST_HEAD(&queue->msg_list);
+	queue->msg_prio_queue = RB_ROOT;	/* msg_prio_highest untouched */
+}
+
+/**
+ * kdbus_queue_peek() - Retrieves an entry from a queue
+ * @queue:		The queue
+ * @priority:		The minimum priority of the entry to peek
+ * @use_priority:	Boolean flag whether or not to peek by priority
+ *
+ * Look for an entry in a queue, either the one cached as highest priority
+ * or the oldest one (FIFO). Lower numeric values rank higher. The entry
+ * stays linked in the queue and is not modified in any way.
+ *
+ * Return: the peeked queue entry on success, NULL if no suitable msg is found
+ */
+struct kdbus_queue_entry *kdbus_queue_peek(struct kdbus_queue *queue,
+					   s64 priority, bool use_priority)
+{
+	struct kdbus_queue_entry *head;
+
+	if (list_empty(&queue->msg_list))
+		return NULL;
+
+	/* plain FIFO order: hand out the oldest queued entry */
+	if (!use_priority)
+		return list_first_entry(&queue->msg_list,
+					struct kdbus_queue_entry, entry);
+
+	/* the cached tree node carries the numerically lowest priority */
+	head = rb_entry(queue->msg_prio_highest,
+			struct kdbus_queue_entry, prio_node);
+
+	/* even the best entry does not satisfy the requested priority */
+	if (head->priority > priority)
+		return NULL;
+
+	return head;
+}
+
+/* Link @entry into the priority tree and the FIFO list of entry->conn. */
+static void kdbus_queue_entry_link(struct kdbus_queue_entry *entry)
+{
+	struct kdbus_queue *queue = &entry->conn->queue;
+	struct rb_node **n, *pn = NULL;
+	bool highest = true;
+
+	lockdep_assert_held(&entry->conn->lock);
+	if (WARN_ON(!list_empty(&entry->entry)))
+		return;
+
+	/* one tree node per priority, smaller values go to the left */
+	n = &queue->msg_prio_queue.rb_node;
+	while (*n) {
+		struct kdbus_queue_entry *e;
+
+		pn = *n;
+		e = rb_entry(pn, struct kdbus_queue_entry, prio_node);
+
+		/* existing node for this priority, append to its list */
+		if (likely(entry->priority == e->priority)) {
+			list_add_tail(&entry->prio_entry, &e->prio_entry);
+			goto prio_done;
+		}
+		if (entry->priority < e->priority) {
+			n = &pn->rb_left;
+		} else {
+			n = &pn->rb_right;
+			highest = false;
+		}
+	}
+
+	/* only left turns were taken, @entry becomes the leftmost node */
+	if (highest)
+		queue->msg_prio_highest = &entry->prio_node;
+
+	/* new node for this priority */
+	rb_link_node(&entry->prio_node, pn, n);
+	rb_insert_color(&entry->prio_node, &queue->msg_prio_queue);
+	INIT_LIST_HEAD(&entry->prio_entry);
+
+prio_done:
+	/* add to unsorted fifo list */
+	list_add_tail(&entry->entry, &queue->msg_list);
+}
+
+/* Take @entry off the FIFO list and the priority tree; no-op if unlinked. */
+static void kdbus_queue_entry_unlink(struct kdbus_queue_entry *entry)
+{
+	struct kdbus_queue *queue = &entry->conn->queue;
+
+	lockdep_assert_held(&entry->conn->lock);
+	if (list_empty(&entry->entry))
+		return;
+
+	list_del_init(&entry->entry);
+
+	if (list_empty(&entry->prio_entry)) {
+		/* only entry of this priority: fix the cache, drop the node */
+		if (queue->msg_prio_highest == &entry->prio_node)
+			queue->msg_prio_highest = rb_next(&entry->prio_node);
+
+		rb_erase(&entry->prio_node, &queue->msg_prio_queue);
+	} else {
+		struct kdbus_queue_entry *q;
+
+		/*
+		 * Multiple entries for this priority entry, get next one in
+		 * the list. Update cached highest-priority entry, store the
+		 * new one as the tree node.
+		 * NOTE(review): entries appended to an existing priority by
+		 * kdbus_queue_entry_link() are never inserted into the tree;
+		 * confirm they are not unlinked ahead of their tree node.
+		 */
+		q = list_first_entry(&entry->prio_entry,
+				     struct kdbus_queue_entry, prio_entry);
+		list_del(&entry->prio_entry);
+
+		if (queue->msg_prio_highest == &entry->prio_node)
+			queue->msg_prio_highest = &q->prio_node;
+		rb_replace_node(&entry->prio_node, &q->prio_node,
+				&queue->msg_prio_queue);
+	}
+}
+
+/**
+ * kdbus_queue_entry_new() - allocate a queue entry
+ * @src:	source connection, or NULL
+ * @dst:	destination connection
+ * @s:		staging object carrying the message
+ *
+ * Allocates a queue entry for the message staged in @s and lets
+ * kdbus_staging_emit() produce the slice that backs it. The entry takes a
+ * reference on @dst, on the gaps object of @s and, if @src is given, on the
+ * user of @src.
+ * The entry is not added to the queue's lists at this point; use
+ * kdbus_queue_entry_enqueue() for that.
+ *
+ * Return: the allocated entry on success, or an ERR_PTR on failure.
+ */
+struct kdbus_queue_entry *kdbus_queue_entry_new(struct kdbus_conn *src,
+						struct kdbus_conn *dst,
+						struct kdbus_staging *s)
+{
+	struct kdbus_queue_entry *entry;
+	struct kdbus_pool_slice *slice;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&entry->entry);
+	entry->priority = s->msg->priority;
+	entry->conn = kdbus_conn_ref(dst);
+	entry->gaps = kdbus_gaps_ref(s->gaps);
+
+	slice = kdbus_staging_emit(s, src, dst);
+	if (IS_ERR(slice)) {
+		/* entry->slice is still NULL: free() skips quota and pool */
+		kdbus_queue_entry_free(entry);
+		return ERR_PTR(PTR_ERR(slice));
+	}
+
+	entry->slice = slice;
+	entry->user = src ? kdbus_user_ref(src->user) : NULL;
+	return entry;
+}
+
+/**
+ * kdbus_queue_entry_free() - free resources of an entry
+ * @entry:	The entry to free, may be %NULL
+ *
+ * Unlinks @entry, drops its references and frees it. Only the kernel
+ * reference on its slice is dropped; a published slice stays allocated.
+ */
+void kdbus_queue_entry_free(struct kdbus_queue_entry *entry)
+{
+	size_t n_bytes, n_fds;
+
+	if (!entry)
+		return;
+
+	lockdep_assert_held(&entry->conn->lock);
+	kdbus_queue_entry_unlink(entry);
+	kdbus_reply_unref(entry->reply);
+	if (entry->slice) {
+		n_bytes = kdbus_pool_slice_size(entry->slice);
+		n_fds = entry->gaps ? entry->gaps->n_fds : 0;
+		kdbus_conn_quota_dec(entry->conn, entry->user, n_bytes, n_fds);
+		kdbus_pool_slice_release(entry->slice);
+	}
+
+	kdbus_user_unref(entry->user);
+	kdbus_gaps_unref(entry->gaps);
+	kdbus_conn_unref(entry->conn);
+	kfree(entry);
+}
+
+/**
+ * kdbus_queue_entry_install() - install message components into the
+ *				 receiver's process
+ * @entry:		The queue entry to install
+ * @return_flags:	Pointer to store the return flags for userspace
+ * @install_fds:	Whether or not to install associated file descriptors
+ *
+ * Return: 0 on success, the negative error of kdbus_gaps_install() otherwise.
+ */
+int kdbus_queue_entry_install(struct kdbus_queue_entry *entry,
+			      u64 *return_flags, bool install_fds)
+{
+	bool incomplete_fds = false;
+	int ret;
+
+	lockdep_assert_held(&entry->conn->lock);
+
+	ret = kdbus_gaps_install(entry->gaps, entry->slice, &incomplete_fds);
+	if (ret < 0)
+		return ret;
+	/* NOTE(review): @install_fds is never read here -- confirm intent */
+	if (incomplete_fds)
+		*return_flags |= KDBUS_RECV_RETURN_INCOMPLETE_FDS;
+	return 0;
+}
+
+/**
+ * kdbus_queue_entry_enqueue() - enqueue an entry
+ * @entry:		entry to enqueue
+ * @reply:		reply to link to this entry (or NULL if none)
+ *
+ * Links a so far unqueued entry into the message queue of its connection
+ * and stores a reference to @reply in it. The caller must hold the lock of
+ * that connection; once it is released, the target connection can dequeue
+ * the entry. Enqueuing an entry that is already queued or already carries a
+ * reply triggers a warning and changes nothing.
+ */
+void kdbus_queue_entry_enqueue(struct kdbus_queue_entry *entry,
+			       struct kdbus_reply *reply)
+{
+	lockdep_assert_held(&entry->conn->lock);
+
+	if (WARN_ON(entry->reply))
+		return;
+	if (WARN_ON(!list_empty(&entry->entry)))
+		return;
+
+	entry->reply = kdbus_reply_ref(reply);
+	kdbus_queue_entry_link(entry);
+}
+
+/**
+ * kdbus_queue_entry_move() - move queue entry
+ * @e:		queue entry to move
+ * @dst:	destination connection to queue the entry on
+ *
+ * This moves a queue entry from the connection it is queued on to @dst. The
+ * quota of @dst is charged, a new accounted slice is allocated in the pool of
+ * @dst and the message is copied over. Only then is the entry unlinked from
+ * its old connection and linked on @dst. Moving to the connection the entry
+ * is already queued on is a no-op.
+ *
+ * On failure, the entry is left untouched.
+ *
+ * The queue entry must be queued right now. The caller must hold the
+ * connection lock of the source *and* destination.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_queue_entry_move(struct kdbus_queue_entry *e,
+			   struct kdbus_conn *dst)
+{
+	struct kdbus_conn *src = e->conn;
+	struct kdbus_pool_slice *copy;
+	size_t size, fds;
+	int ret;
+
+	lockdep_assert_held(&src->lock);
+	lockdep_assert_held(&dst->lock);
+
+	if (WARN_ON(list_empty(&e->entry)))
+		return -EINVAL;
+	if (src == dst)
+		return 0;
+
+	size = kdbus_pool_slice_size(e->slice);
+	fds = e->gaps ? e->gaps->n_fds : 0;
+
+	/* charge the receiver first, undone below if anything fails */
+	ret = kdbus_conn_quota_inc(dst, e->user, size, fds);
+	if (ret < 0)
+		return ret;
+
+	copy = kdbus_pool_slice_alloc(dst->pool, size, true);
+	if (IS_ERR(copy)) {
+		kdbus_conn_quota_dec(dst, e->user, size, fds);
+		return PTR_ERR(copy);
+	}
+
+	ret = kdbus_pool_slice_copy(copy, e->slice);
+	if (ret < 0) {
+		kdbus_pool_slice_release(copy);
+		kdbus_conn_quota_dec(dst, e->user, size, fds);
+		return ret;
+	}
+
+	/* the copy succeeded: detach from @src ... */
+	kdbus_queue_entry_unlink(e);
+	kdbus_conn_quota_dec(src, e->user, size, fds);
+	kdbus_pool_slice_release(e->slice);
+	kdbus_conn_unref(e->conn);
+
+	/* ... and requeue on @dst, backed by the new slice */
+	e->slice = copy;
+	e->conn = kdbus_conn_ref(dst);
+	kdbus_queue_entry_link(e);
+
+	return 0;
+}
diff --git a/ipc/kdbus/queue.h b/ipc/kdbus/queue.h
new file mode 100644
index 000000000..bf686d182
--- /dev/null
+++ b/ipc/kdbus/queue.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_QUEUE_H
+#define __KDBUS_QUEUE_H
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+struct kdbus_conn;
+struct kdbus_pool_slice;
+struct kdbus_reply;
+struct kdbus_staging;
+struct kdbus_user;
+
+/**
+ * struct kdbus_queue - a connection's message queue
+ * @msg_list:		List head for kdbus_queue_entry objects, FIFO order
+ * @msg_prio_queue:	RB tree root for messages, one node per priority
+ * @msg_prio_highest:	Link to the RB node referencing the message with the
+ *			highest priority (lowest value) in the tree.
+ */
+struct kdbus_queue {
+	struct list_head msg_list;
+	struct rb_root msg_prio_queue;
+	struct rb_node *msg_prio_highest;
+};
+
+/**
+ * struct kdbus_queue_entry - messages waiting to be read
+ * @entry:		Entry in the connection's list
+ * @prio_node:		Entry in the priority queue tree
+ * @prio_entry:		Entry in the list of entries sharing one priority
+ * @priority:		Message priority, lower values are peeked first
+ * @dst_name_id:	The sequence number of the name this message is
+ *			addressed to, 0 for messages sent to an ID
+ * @conn:		Connection this entry is queued on
+ * @gaps:		Gaps object to fill message gaps at RECV time
+ * @user:		User used for accounting
+ * @slice:		Slice in the receiver's pool for the message
+ * @reply:		The reply block if a reply to this message is expected
+ */
+struct kdbus_queue_entry {
+	struct list_head entry;
+	struct rb_node prio_node;
+	struct list_head prio_entry;
+
+	s64 priority;
+	u64 dst_name_id;
+
+	struct kdbus_conn *conn;
+	struct kdbus_gaps *gaps;
+	struct kdbus_user *user;
+	struct kdbus_pool_slice *slice;
+	struct kdbus_reply *reply;
+};
+
+void kdbus_queue_init(struct kdbus_queue *queue);
+struct kdbus_queue_entry *kdbus_queue_peek(struct kdbus_queue *queue,
+					   s64 priority, bool use_priority);
+
+struct kdbus_queue_entry *kdbus_queue_entry_new(struct kdbus_conn *src,
+						struct kdbus_conn *dst,
+						struct kdbus_staging *s);
+void kdbus_queue_entry_free(struct kdbus_queue_entry *entry);
+int kdbus_queue_entry_install(struct kdbus_queue_entry *entry,
+			      u64 *return_flags, bool install_fds);
+void kdbus_queue_entry_enqueue(struct kdbus_queue_entry *entry,
+			       struct kdbus_reply *reply);
+int kdbus_queue_entry_move(struct kdbus_queue_entry *entry,
+			   struct kdbus_conn *dst);
+
+#endif /* __KDBUS_QUEUE_H */
diff --git a/ipc/kdbus/reply.c b/ipc/kdbus/reply.c
new file mode 100644
index 000000000..e6791d86e
--- /dev/null
+++ b/ipc/kdbus/reply.c
@@ -0,0 +1,252 @@
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "message.h"
+#include "metadata.h"
+#include "names.h"
+#include "domain.h"
+#include "item.h"
+#include "notify.h"
+#include "policy.h"
+#include "reply.h"
+#include "util.h"
+
+/**
+ * kdbus_reply_new() - Allocate and set up a new kdbus_reply object
+ * @reply_src:		The connection a reply is expected from
+ * @reply_dst:		The connection this reply object belongs to
+ * @msg:		Message associated with the reply
+ * @name_entry:		Name entry used to send the message, may be %NULL
+ * @sync:		Whether or not to make this reply synchronous
+ *
+ * Allocate and fill a new kdbus_reply object. The object holds a reference
+ * on both connections and counts as one pending request of @reply_dst until
+ * it is freed. Cookie and deadline are taken from @msg.
+ *
+ * Return: New kdbus_reply object on success, ERR_PTR(-EMLINK) if @reply_dst
+ * has too many pending requests already, ERR_PTR(-ENOMEM) if the allocation
+ * fails.
+ */
+struct kdbus_reply *kdbus_reply_new(struct kdbus_conn *reply_src,
+				    struct kdbus_conn *reply_dst,
+				    const struct kdbus_msg *msg,
+				    struct kdbus_name_entry *name_entry,
+				    bool sync)
+{
+	struct kdbus_reply *reply;
+	int pending;
+
+	/* the pending request is counted first and un-counted on failure */
+	pending = atomic_inc_return(&reply_dst->request_count);
+	if (pending > KDBUS_CONN_MAX_REQUESTS_PENDING) {
+		atomic_dec(&reply_dst->request_count);
+		return ERR_PTR(-EMLINK);
+	}
+
+	reply = kzalloc(sizeof(*reply), GFP_KERNEL);
+	if (!reply) {
+		atomic_dec(&reply_dst->request_count);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	kref_init(&reply->kref);
+	INIT_LIST_HEAD(&reply->entry);
+	reply->reply_src = kdbus_conn_ref(reply_src);
+	reply->reply_dst = kdbus_conn_ref(reply_dst);
+	reply->cookie = msg->cookie;
+	reply->name_id = name_entry ? name_entry->name_id : 0;
+	reply->deadline_ns = msg->timeout_ns;
+
+	/* synchronous replies start out in the waiting state */
+	reply->sync = sync;
+	reply->waiting = sync;
+
+	return reply;
+}
+
+/* kref release callback: undo what kdbus_reply_new() has set up */
+static void __kdbus_reply_free(struct kref *kref)
+{
+	struct kdbus_reply *r = container_of(kref, struct kdbus_reply, kref);
+
+	atomic_dec(&r->reply_dst->request_count);
+	kdbus_conn_unref(r->reply_src);
+	kdbus_conn_unref(r->reply_dst);
+	kfree(r);
+}
+
+/**
+ * kdbus_reply_ref() - Increase reference on kdbus_reply
+ * @r:		The reply, may be %NULL
+ *
+ * Return: @r, with an extra reference unless it is %NULL
+ */
+struct kdbus_reply *kdbus_reply_ref(struct kdbus_reply *r)
+{
+	if (r)
+		kref_get(&r->kref);
+	return r;
+}
+
+/**
+ * kdbus_reply_unref() - Decrease reference on kdbus_reply
+ * @r:		The reply, may be %NULL
+ *
+ * Return: always NULL, the reply is freed with its last reference
+ */
+struct kdbus_reply *kdbus_reply_unref(struct kdbus_reply *r)
+{
+	if (r)
+		kref_put(&r->kref, __kdbus_reply_free);
+	return NULL;
+}
+
+/**
+ * kdbus_reply_link() - Link reply object into target connection
+ * @r:		Reply to link, must not be linked yet
+ */
+void kdbus_reply_link(struct kdbus_reply *r)
+{
+	if (WARN_ON(!list_empty(&r->entry)))
+		return;
+	/* the list holds its own reference, dropped by kdbus_reply_unlink() */
+	kdbus_reply_ref(r);
+	list_add(&r->entry, &r->reply_dst->reply_list);
+}
+
+/**
+ * kdbus_reply_unlink() - Unlink reply object from target connection
+ * @r:		Reply to unlink, nothing is done if it is not linked
+ */
+void kdbus_reply_unlink(struct kdbus_reply *r)
+{
+	if (list_empty(&r->entry))
+		return;
+	list_del_init(&r->entry);
+	kdbus_reply_unref(r);
+}
+
+/**
+ * kdbus_sync_reply_wakeup() - Wake a synchronously blocking reply
+ * @reply:	The reply object, must be a synchronous one
+ * @err:	Error code to set on the remote side
+ *
+ * Clear the waiting condition of @reply, store @err in it and wake up the
+ * wait queue of the connection the reply is destined for (method origin).
+ */
+void kdbus_sync_reply_wakeup(struct kdbus_reply *reply, int err)
+{
+	if (WARN_ON(!reply->sync))
+		return;
+
+	reply->waiting = false;
+	reply->err = err;
+	wake_up_interruptible(&reply->reply_dst->wait);
+}
+
+/**
+ * kdbus_reply_find() - Find the corresponding reply object
+ * @replying:	The replying connection, or NULL to accept any
+ * @reply_dst:	The connection the reply will be sent to
+ *		(method origin)
+ * @cookie:	The cookie of the requesting message
+ *
+ * Look up the reply object linked on @reply_dst that @replying is expected
+ * to answer for @cookie. Callers must take the @reply_dst lock.
+ *
+ * Return: the corresponding reply object or NULL if not found
+ */
+struct kdbus_reply *kdbus_reply_find(struct kdbus_conn *replying,
+				     struct kdbus_conn *reply_dst,
+				     u64 cookie)
+{
+	struct kdbus_reply *r;
+
+	list_for_each_entry(r, &reply_dst->reply_list, entry) {
+		if (r->cookie != cookie)
+			continue;
+		/* a NULL @replying matches replies from any source */
+		if (!replying || r->reply_src == replying)
+			return r;
+	}
+
+	return NULL;
+}
+
+/**
+ * kdbus_reply_list_scan_work() - Worker callback to scan the replies of a
+ *				  connection for exceeded timeouts
+ * @work:		Work struct of the connection to scan
+ *
+ * Walk the list of replies stored with a connection and look for entries
+ * that have exceeded their timeout. If such an entry is found, a timeout
+ * notification is sent to the waiting peer, and the reply is removed from
+ * the list.
+ *
+ * The work is rescheduled to the nearest timeout found during the list
+ * iteration. Connections that are no longer active are not scanned.
+ */
+void kdbus_reply_list_scan_work(struct work_struct *work)
+{
+	struct kdbus_conn *conn =
+		container_of(work, struct kdbus_conn, work.work);
+	struct kdbus_reply *reply, *reply_tmp;
+	u64 deadline = ~0ULL;	/* ~0ULL: no timeout pending */
+	u64 now;
+
+	now = ktime_get_ns();
+
+	mutex_lock(&conn->lock);
+	if (!kdbus_conn_active(conn)) {
+		mutex_unlock(&conn->lock);
+		return;
+	}
+
+	list_for_each_entry_safe(reply, reply_tmp, &conn->reply_list, entry) {
+		/*
+		 * If the reply block is waiting for synchronous I/O,
+		 * the timeout is handled by wait_event_*_timeout(),
+		 * so we don't have to care for it here.
+		 */
+		if (reply->sync && !reply->interrupted)
+			continue;
+
+		WARN_ON(reply->reply_dst != conn);
+
+		if (reply->deadline_ns > now) {
+			/* not due yet, remember the earliest deadline */
+			if (deadline > reply->deadline_ns)
+				deadline = reply->deadline_ns;
+
+			continue;
+		}
+
+		/*
+		 * A zero deadline means the connection died, was
+		 * cleaned up already and the notification was sent.
+		 * Don't send notifications for reply trackers that were
+		 * left in an interrupted syscall state.
+		 */
+		if (reply->deadline_ns != 0 && !reply->interrupted)
+			kdbus_notify_reply_timeout(conn->ep->bus, conn->id,
+						   reply->cookie);
+
+		kdbus_reply_unlink(reply);	/* drops the list's ref */
+	}
+
+	/* rearm delayed work unless no future deadline was found */
+	if (deadline != ~0ULL)
+		schedule_delayed_work(&conn->work,
+				      nsecs_to_jiffies(deadline - now));
+
+	mutex_unlock(&conn->lock);
+
+	kdbus_notify_flush(conn->ep->bus);
+}
diff --git a/ipc/kdbus/reply.h b/ipc/kdbus/reply.h
new file mode 100644
index 000000000..68d52321a
--- /dev/null
+++ b/ipc/kdbus/reply.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_REPLY_H
+#define __KDBUS_REPLY_H
+
+/**
+ * struct kdbus_reply - an entry of kdbus_conn's list of replies
+ * @kref:		Ref-count of this object
+ * @entry:		The entry of the connection's reply_list
+ * @reply_src:		The connection the reply will be sent from
+ * @reply_dst:		The connection the reply will be sent to
+ * @queue_entry:	The queue entry item that is prepared by the replying
+ *			connection
+ * @deadline_ns:	The deadline of the reply, in nanoseconds
+ * @cookie:		The cookie of the requesting message
+ * @name_id:		ID of the well-known name the original msg was sent to
+ * @sync:		The reply block is waiting for synchronous I/O
+ * @waiting:		The condition to synchronously wait for
+ * @interrupted:	The sync reply was left in an interrupted state
+ * @err:		The error code for the synchronous reply
+ */
+struct kdbus_reply {
+	struct kref kref;
+	struct list_head entry;
+	struct kdbus_conn *reply_src;
+	struct kdbus_conn *reply_dst;
+	struct kdbus_queue_entry *queue_entry;
+	u64 deadline_ns;
+	u64 cookie;
+	u64 name_id;
+	bool sync:1;
+	bool waiting:1;
+	bool interrupted:1;
+	int err;
+};
+
+struct kdbus_reply *kdbus_reply_new(struct kdbus_conn *reply_src,
+				    struct kdbus_conn *reply_dst,
+				    const struct kdbus_msg *msg,
+				    struct kdbus_name_entry *name_entry,
+				    bool sync);
+
+struct kdbus_reply *kdbus_reply_ref(struct kdbus_reply *r);
+struct kdbus_reply *kdbus_reply_unref(struct kdbus_reply *r);
+
+void kdbus_reply_link(struct kdbus_reply *r);
+void kdbus_reply_unlink(struct kdbus_reply *r);
+
+struct kdbus_reply *kdbus_reply_find(struct kdbus_conn *replying,
+				     struct kdbus_conn *reply_dst,
+				     u64 cookie);
+
+void kdbus_sync_reply_wakeup(struct kdbus_reply *reply, int err);
+void kdbus_reply_list_scan_work(struct work_struct *work);
+
+#endif /* __KDBUS_REPLY_H */
diff --git a/ipc/kdbus/util.c b/ipc/kdbus/util.c
new file mode 100644
index 000000000..72b188330
--- /dev/null
+++ b/ipc/kdbus/util.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/cred.h>
+#include <linux/ctype.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <linux/user_namespace.h>
+
+#include "limits.h"
+#include "util.h"
+
+/**
+ * kdbus_copy_from_user() - copy aligned data from user-space
+ * @dest:	target buffer in kernel memory
+ * @user_ptr:	user-provided source buffer
+ * @size:	memory size to copy from user
+ *
+ * This copies @size bytes from @user_ptr into the kernel, just like
+ * copy_from_user() does. But we enforce an 8-byte alignment and reject any
+ * unaligned user-space pointers.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_copy_from_user(void *dest, void __user *user_ptr, size_t size)
+{
+	if (!KDBUS_IS_ALIGNED8((uintptr_t)user_ptr))
+		return -EFAULT;
+
+	if (copy_from_user(dest, user_ptr, size))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * kdbus_verify_uid_prefix() - verify UID prefix of a user-supplied name
+ * @name:	user-supplied name to verify
+ * @user_ns:	user-namespace to act in
+ * @kuid:	Kernel internal uid of user
+ *
+ * This verifies that the user-supplied name @name has their UID as prefix. This
+ * is the default name-spacing policy we enforce on user-supplied names for
+ * public kdbus entities like buses and endpoints.
+ *
+ * The user must supply names prefixed with "<UID>-", where the UID is
+ * interpreted in the user-namespace of the domain. If the user fails to supply
+ * such a prefixed name, we reject it with -EINVAL.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int kdbus_verify_uid_prefix(const char *name, struct user_namespace *user_ns,
+			    kuid_t kuid)
+{
+	uid_t uid;
+	char prefix[16];
+
+	/*
+	 * The kuid must have a mapping into the userns of the domain
+	 * otherwise do not allow creation of buses nor endpoints.
+	 */
+	uid = from_kuid(user_ns, kuid);
+	if (uid == (uid_t) -1)
+		return -EINVAL;
+
+	snprintf(prefix, sizeof(prefix), "%u-", uid);
+	if (strncmp(name, prefix, strlen(prefix)) != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * kdbus_sanitize_attach_flags() - Sanitize attach flags from user-space
+ * @flags:		Attach flags provided by userspace
+ * @attach_flags:	A pointer where to store the valid attach flags
+ *
+ * Convert attach-flags provided by user-space into a valid mask. If the mask
+ * is invalid, an error is returned. The sanitized attach flags are stored in
+ * the output parameter.
+ *
+ * Return: 0 on success, negative error on failure.
+ */
+int kdbus_sanitize_attach_flags(u64 flags, u64 *attach_flags)
+{
+	/* 'any' degrades to 'all' for compatibility */
+	if (flags == _KDBUS_ATTACH_ANY)
+		flags = _KDBUS_ATTACH_ALL;
+
+	/* reject unknown attach flags */
+	if (flags & ~_KDBUS_ATTACH_ALL)
+		return -EINVAL;
+
+	*attach_flags = flags;
+	return 0;
+}
+
+/**
+ * kdbus_kvec_set - helper utility to assemble kvec arrays
+ * @kvec:	kvec entry to use
+ * @src:	Source address to set in @kvec
+ * @len:	Number of bytes in @src
+ * @total_len:	Pointer to total length variable
+ *
+ * Set @src and @len in @kvec, and increase @total_len by @len.
+ */
+void kdbus_kvec_set(struct kvec *kvec, void *src, size_t len, u64 *total_len)
+{
+	kvec->iov_base = src;
+	kvec->iov_len = len;
+	*total_len += len;
+}
+
+static const char * const zeros = "\0\0\0\0\0\0\0";
+
+/**
+ * kdbus_kvec_pad - conditionally write a padding kvec
+ * @kvec:	kvec entry to use
+ * @len:	Total length used for kvec array
+ *
+ * Check if the current total byte length of the array in @len is aligned to
+ * 8 bytes. If it isn't, fill @kvec with padding information and increase @len
+ * by the number of bytes stored in @kvec.
+ *
+ * Return: the number of added padding bytes.
+ */
+size_t kdbus_kvec_pad(struct kvec *kvec, u64 *len)
+{
+	size_t pad = KDBUS_ALIGN8(*len) - *len;
+
+	if (!pad)
+		return 0;
+
+	kvec->iov_base = (void *)zeros;
+	kvec->iov_len = pad;
+
+	*len += pad;
+
+	return pad;
+}
diff --git a/ipc/kdbus/util.h b/ipc/kdbus/util.h
new file mode 100644
index 000000000..529716669
--- /dev/null
+++ b/ipc/kdbus/util.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ * Copyright (C) 2013-2015 Daniel Mack <daniel@zonque.org>
+ * Copyright (C) 2013-2015 David Herrmann <dh.herrmann@gmail.com>
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni <tixxdz@opendz.org>
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_UTIL_H
+#define __KDBUS_UTIL_H
+
+#include <linux/dcache.h>
+#include <linux/ioctl.h>
+
+#include <uapi/linux/kdbus.h>
+
+/* all exported addresses are 64 bit */
+#define KDBUS_PTR(addr) ((void __user *)(uintptr_t)(addr))
+
+/* all exported sizes are 64 bit and data aligned to 64 bit */
+#define KDBUS_ALIGN8(s) ALIGN((s), 8)
+#define KDBUS_IS_ALIGNED8(s) (IS_ALIGNED(s, 8))
+
+/**
+ * kdbus_member_set_user - write a structure member to user memory
+ * @_s:		Variable to copy from
+ * @_b:		Buffer to write to
+ * @_t:		Structure type
+ * @_m:		Member name in the passed structure
+ *
+ * Return: the result of copy_to_user()
+ */
+#define kdbus_member_set_user(_s, _b, _t, _m)				\
+({									\
+	u64 __user *_sz =						\
+		(void __user *)((u8 __user *)(_b) + offsetof(_t, _m));	\
+	copy_to_user(_sz, _s, FIELD_SIZEOF(_t, _m));			\
+})
+
+/**
+ * kdbus_strhash - calculate a hash
+ * @str:	String
+ *
+ * Return: hash value
+ */
+static inline unsigned int kdbus_strhash(const char *str)
+{
+	unsigned long hash = init_name_hash();
+
+	while (*str)
+		hash = partial_name_hash(*str++, hash);
+
+	return end_name_hash(hash);
+}
+
+int kdbus_verify_uid_prefix(const char *name, struct user_namespace *user_ns,
+			    kuid_t kuid);
+int kdbus_sanitize_attach_flags(u64 flags, u64 *attach_flags);
+
+int kdbus_copy_from_user(void *dest, void __user *user_ptr, size_t size);
+
+struct kvec;
+
+void kdbus_kvec_set(struct kvec *kvec, void *src, size_t len, u64 *total_len);
+size_t kdbus_kvec_pad(struct kvec *kvec, u64 *len);
+
+#endif
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
new file mode 100644
index 000000000..68d4e9537
--- /dev/null
+++ b/ipc/mq_sysctl.c
@@ -0,0 +1,124 @@
+/*
+ *  Copyright (C) 2007 IBM Corporation
+ *
+ *  Author: Cedric Le Goater <clg@fr.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+#include <linux/sysctl.h>
+
+#ifdef CONFIG_PROC_SYSCTL
+static void *get_mq(struct ctl_table *table)
+{
+	char *which = table->data;
+	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
+	return which;
+}
+
+static int proc_mq_dointvec(struct ctl_table *table, int write,
+			    void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table mq_table;
+	memcpy(&mq_table, table, sizeof(mq_table));
+	mq_table.data = get_mq(table);
+
+	return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
+}
+
+static int proc_mq_dointvec_minmax(struct ctl_table *table, int write,
+	void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table mq_table;
+	memcpy(&mq_table, table, sizeof(mq_table));
+	mq_table.data = get_mq(table);
+
+	return proc_dointvec_minmax(&mq_table, write, buffer,
+					lenp, ppos);
+}
+#else
+#define proc_mq_dointvec NULL
+#define proc_mq_dointvec_minmax NULL
+#endif
+
+static int msg_max_limit_min = MIN_MSGMAX;
+static int msg_max_limit_max = HARD_MSGMAX;
+
+static int msg_maxsize_limit_min = MIN_MSGSIZEMAX;
+static int msg_maxsize_limit_max = HARD_MSGSIZEMAX;
+
+static struct ctl_table mq_sysctls[] = {
+	{
+		.procname	= "queues_max",
+		.data		= &init_ipc_ns.mq_queues_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_mq_dointvec,
+	},
+	{
+		.procname	= "msg_max",
+		.data		= &init_ipc_ns.mq_msg_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_mq_dointvec_minmax,
+		.extra1		= &msg_max_limit_min,
+		.extra2		= &msg_max_limit_max,
+	},
+	{
+		.procname	= "msgsize_max",
+		.data		= &init_ipc_ns.mq_msgsize_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_mq_dointvec_minmax,
+		.extra1		= &msg_maxsize_limit_min,
+		.extra2		= &msg_maxsize_limit_max,
+	},
+	{
+		.procname	= "msg_default",
+		.data		= &init_ipc_ns.mq_msg_default,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_mq_dointvec_minmax,
+		.extra1		= &msg_max_limit_min,
+		.extra2		= &msg_max_limit_max,
+	},
+	{
+		.procname	= "msgsize_default",
+		.data		= &init_ipc_ns.mq_msgsize_default,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_mq_dointvec_minmax,
+		.extra1		= &msg_maxsize_limit_min,
+		.extra2		= &msg_maxsize_limit_max,
+	},
+	{}
+};
+
+static struct ctl_table mq_sysctl_dir[] = {
+	{
+		.procname	= "mqueue",
+		.mode		= 0555,
+		.child		= mq_sysctls,
+	},
+	{}
+};
+
+static struct ctl_table mq_sysctl_root[] = {
+	{
+		.procname	= "fs",
+		.mode		= 0555,
+		.child		= mq_sysctl_dir,
+	},
+	{}
+};
+
+struct ctl_table_header *mq_register_sysctl_table(void)
+{
+	return register_sysctl_table(mq_sysctl_root);
+}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
new file mode 100644
index 000000000..3aaea7ffd
--- /dev/null
+++ b/ipc/mqueue.c
@@ -0,0 +1,1462 @@
+/*
+ * POSIX message queues filesystem for Linux.
+ *
+ * Copyright (C) 2003,2004  Krzysztof Benedyczak    (golbi@mat.uni.torun.pl)
+ *                          Michal Wronski          (michal.wronski@gmail.com)
+ *
+ * Spinlocks:               Mohamed Abbas           (abbas.mohamed@intel.com)
+ * Lockless receive & send, fd based notify:
+ *			    Manfred Spraul	    (manfred@colorfullife.com)
+ *
+ * Audit:                   George Wilson           (ltcgcw@us.ibm.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/capability.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sysctl.h>
+#include <linux/poll.h>
+#include <linux/mqueue.h>
+#include <linux/msg.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/netlink.h>
+#include <linux/syscalls.h>
+#include <linux/audit.h>
+#include <linux/signal.h>
+#include <linux/mutex.h>
+#include <linux/nsproxy.h>
+#include <linux/pid.h>
+#include <linux/ipc_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/slab.h>
+
+#include <net/sock.h>
+#include "util.h"
+
+#define MQUEUE_MAGIC	0x19800202
+#define DIRENT_SIZE	20
+#define FILENT_SIZE	80
+
+#define SEND		0
+#define RECV		1
+
+#define STATE_NONE	0
+#define STATE_PENDING	1
+#define STATE_READY	2
+
+struct posix_msg_tree_node {
+	struct rb_node		rb_node;
+	struct list_head	msg_list;
+	int			priority;
+};
+
+struct ext_wait_queue {		/* queue of sleeping tasks */
+	struct task_struct *task;
+	struct list_head list;
+	struct msg_msg *msg;	/* ptr of loaded message */
+	int state;		/* one of STATE_* values */
+};
+
+struct mqueue_inode_info {
+	spinlock_t lock;
+	struct inode vfs_inode;
+	wait_queue_head_t wait_q;
+
+	struct rb_root msg_tree;
+	struct posix_msg_tree_node *node_cache;
+	struct mq_attr attr;
+
+	struct sigevent notify;
+	struct pid *notify_owner;
+	struct user_namespace *notify_user_ns;
+	struct user_struct *user;	/* user who created, for accounting */
+	struct sock *notify_sock;
+	struct sk_buff *notify_cookie;
+
+	/* for tasks waiting for free space and messages, respectively */
+	struct ext_wait_queue e_wait_q[2];
+
+	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
+};
+
+static const struct inode_operations mqueue_dir_inode_operations;
+static const struct file_operations mqueue_file_operations;
+static const struct super_operations mqueue_super_ops;
+static void remove_notification(struct mqueue_inode_info *info);
+
+static struct kmem_cache *mqueue_inode_cachep;
+
+static struct ctl_table_header *mq_sysctl_table;
+
+static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
+{
+	return container_of(inode, struct mqueue_inode_info, vfs_inode);
+}
+
+/*
+ * This routine should be called with the mq_lock held.
+ */
+static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
+{
+	return get_ipc_ns(inode->i_sb->s_fs_info);
+}
+
+static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
+{
+	struct ipc_namespace *ns;
+
+	spin_lock(&mq_lock);
+	ns = __get_ns_from_inode(inode);
+	spin_unlock(&mq_lock);
+	return ns;
+}
+
+/* Auxiliary functions to manipulate messages' list */
+static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
+{
+	struct rb_node **p, *parent = NULL;
+	struct posix_msg_tree_node *leaf;
+
+	p = &info->msg_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
+
+		if (likely(leaf->priority == msg->m_type))
+			goto insert_msg;
+		else if (msg->m_type < leaf->priority)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	if (info->node_cache) {
+		leaf = info->node_cache;
+		info->node_cache = NULL;
+	} else {
+		leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
+		if (!leaf)
+			return -ENOMEM;
+		INIT_LIST_HEAD(&leaf->msg_list);
+		info->qsize += sizeof(*leaf);
+	}
+	leaf->priority = msg->m_type;
+	rb_link_node(&leaf->rb_node, parent, p);
+	rb_insert_color(&leaf->rb_node, &info->msg_tree);
+insert_msg:
+	info->attr.mq_curmsgs++;
+	info->qsize += msg->m_ts;
+	list_add_tail(&msg->m_list, &leaf->msg_list);
+	return 0;
+}
+
+static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
+{
+	struct rb_node **p, *parent = NULL;
+	struct posix_msg_tree_node *leaf;
+	struct msg_msg *msg;
+
+try_again:
+	p = &info->msg_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		/*
+		 * During insert, low priorities go to the left and high to the
+		 * right.  On receive, we want the highest priorities first, so
+		 * walk all the way to the right.
+		 */
+		p = &(*p)->rb_right;
+	}
+	if (!parent) {
+		if (info->attr.mq_curmsgs) {
+			pr_warn_once("Inconsistency in POSIX message queue, "
+				     "no tree element, but supposedly messages "
+				     "should exist!\n");
+			info->attr.mq_curmsgs = 0;
+		}
+		return NULL;
+	}
+	leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
+	if (unlikely(list_empty(&leaf->msg_list))) {
+		pr_warn_once("Inconsistency in POSIX message queue, "
+			     "empty leaf node but we haven't implemented "
+			     "lazy leaf delete!\n");
+		rb_erase(&leaf->rb_node, &info->msg_tree);
+		if (info->node_cache) {
+			info->qsize -= sizeof(*leaf);
+			kfree(leaf);
+		} else {
+			info->node_cache = leaf;
+		}
+		goto try_again;
+	} else {
+		msg = list_first_entry(&leaf->msg_list,
+				       struct msg_msg, m_list);
+		list_del(&msg->m_list);
+		if (list_empty(&leaf->msg_list)) {
+			rb_erase(&leaf->rb_node, &info->msg_tree);
+			if (info->node_cache) {
+				info->qsize -= sizeof(*leaf);
+				kfree(leaf);
+			} else {
+				info->node_cache = leaf;
+			}
+		}
+	}
+	info->attr.mq_curmsgs--;
+	info->qsize -= msg->m_ts;
+	return msg;
+}
+
+static struct inode *mqueue_get_inode(struct super_block *sb,
+		struct ipc_namespace *ipc_ns, umode_t mode,
+		struct mq_attr *attr)
+{
+	struct user_struct *u = current_user();
+	struct inode *inode;
+	int ret = -ENOMEM;
+
+	inode = new_inode(sb);
+	if (!inode)
+		goto err;
+
+	inode->i_ino = get_next_ino();
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_mtime = inode->i_ctime = inode->i_atime = CURRENT_TIME;
+
+	if (S_ISREG(mode)) {
+		struct mqueue_inode_info *info;
+		unsigned long mq_bytes, mq_treesize;
+
+		inode->i_fop = &mqueue_file_operations;
+		inode->i_size = FILENT_SIZE;
+		/* mqueue specific info */
+		info = MQUEUE_I(inode);
+		spin_lock_init(&info->lock);
+		init_waitqueue_head(&info->wait_q);
+		INIT_LIST_HEAD(&info->e_wait_q[0].list);
+		INIT_LIST_HEAD(&info->e_wait_q[1].list);
+		info->notify_owner = NULL;
+		info->notify_user_ns = NULL;
+		info->qsize = 0;
+		info->user = NULL;	/* set when all is ok */
+		info->msg_tree = RB_ROOT;
+		info->node_cache = NULL;
+		memset(&info->attr, 0, sizeof(info->attr));
+		info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
+					   ipc_ns->mq_msg_default);
+		info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
+					    ipc_ns->mq_msgsize_default);
+		if (attr) {
+			info->attr.mq_maxmsg = attr->mq_maxmsg;
+			info->attr.mq_msgsize = attr->mq_msgsize;
+		}
+		/*
+		 * We used to allocate a static array of pointers and account
+		 * the size of that array as well as one msg_msg struct per
+		 * possible message into the queue size. That's no longer
+		 * accurate as the queue is now an rbtree and will grow and
+		 * shrink depending on usage patterns.  We can, however, still
+		 * account one msg_msg struct per message, but the nodes are
+		 * allocated depending on priority usage, and most programs
+		 * only use one, or a handful, of priorities.  However, since
+		 * this is pinned memory, we need to assume worst case, so
+		 * that means the min(mq_maxmsg, max_priorities) * struct
+		 * posix_msg_tree_node.
+		 */
+		mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
+			min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
+			sizeof(struct posix_msg_tree_node);
+
+		mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
+					  info->attr.mq_msgsize);
+
+		spin_lock(&mq_lock);
+		if (u->mq_bytes + mq_bytes < u->mq_bytes ||
+		    u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) {
+			spin_unlock(&mq_lock);
+			/* nothing charged yet; info->user is still NULL */
+			ret = -EMFILE;
+			goto out_inode;
+		}
+		u->mq_bytes += mq_bytes;
+		spin_unlock(&mq_lock);
+
+		/* all is ok */
+		info->user = get_uid(u);
+	} else if (S_ISDIR(mode)) {
+		inc_nlink(inode);
+		/* Some things misbehave if size == 0 on a directory */
+		inode->i_size = 2 * DIRENT_SIZE;
+		inode->i_op = &mqueue_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+	}
+
+	return inode;
+out_inode:
+	iput(inode);
+err:
+	return ERR_PTR(ret);
+}
+
+static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct ipc_namespace *ns = data;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = MQUEUE_MAGIC;
+	sb->s_op = &mqueue_super_ops;
+
+	inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
+		return -ENOMEM;
+	return 0;
+}
+
+static struct dentry *mqueue_mount(struct file_system_type *fs_type,
+			 int flags, const char *dev_name,
+			 void *data)
+{
+	if (!(flags & MS_KERNMOUNT)) {
+		struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+		/* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+		 * over the ipc namespace.
+		 */
+		if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+			return ERR_PTR(-EPERM);
+
+		data = ns;
+	}
+	return mount_ns(fs_type, flags, data, mqueue_fill_super);
+}
+
+static void init_once(void *foo)
+{
+	struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
+
+	inode_init_once(&p->vfs_inode);
+}
+
+static struct inode *mqueue_alloc_inode(struct super_block *sb)
+{
+	struct mqueue_inode_info *ei;
+
+	ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void mqueue_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
+}
+
+static void mqueue_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, mqueue_i_callback);
+}
+
+static void mqueue_evict_inode(struct inode *inode)
+{
+	struct mqueue_inode_info *info;
+	struct user_struct *user;
+	unsigned long mq_bytes, mq_treesize;
+	struct ipc_namespace *ipc_ns;
+	struct msg_msg *msg;
+
+	clear_inode(inode);
+
+	if (S_ISDIR(inode->i_mode))
+		return;
+
+	ipc_ns = get_ns_from_inode(inode);
+	info = MQUEUE_I(inode);
+	spin_lock(&info->lock);
+	while ((msg = msg_get(info)) != NULL)
+		free_msg(msg);
+	kfree(info->node_cache);
+	spin_unlock(&info->lock);
+
+	/* Total amount of bytes accounted for the mqueue */
+	mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
+		min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
+		sizeof(struct posix_msg_tree_node);
+
+	mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
+				  info->attr.mq_msgsize);
+
+	user = info->user;
+	if (user) {
+		spin_lock(&mq_lock);
+		user->mq_bytes -= mq_bytes;
+		/*
+		 * get_ns_from_inode() ensures that the
+		 * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
+		 * to which we now hold a reference, or it is NULL.
+		 * We can't put it here under mq_lock, though.
+		 */
+		if (ipc_ns)
+			ipc_ns->mq_queues_count--;
+		spin_unlock(&mq_lock);
+		free_uid(user);
+	}
+	if (ipc_ns)
+		put_ipc_ns(ipc_ns);
+}
+
+static int mqueue_create(struct inode *dir, struct dentry *dentry,
+				umode_t mode, bool excl)
+{
+	struct inode *inode;
+	struct mq_attr *attr = dentry->d_fsdata;
+	int error;
+	struct ipc_namespace *ipc_ns;
+
+	spin_lock(&mq_lock);
+	ipc_ns = __get_ns_from_inode(dir);
+	if (!ipc_ns) {
+		error = -EACCES;
+		goto out_unlock;
+	}
+
+	if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
+	    !capable(CAP_SYS_RESOURCE)) {
+		error = -ENOSPC;
+		goto out_unlock;
+	}
+	ipc_ns->mq_queues_count++;
+	spin_unlock(&mq_lock);
+
+	inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		spin_lock(&mq_lock);
+		ipc_ns->mq_queues_count--;
+		goto out_unlock;
+	}
+
+	put_ipc_ns(ipc_ns);
+	dir->i_size += DIRENT_SIZE;
+	dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
+
+	d_instantiate(dentry, inode);
+	dget(dentry);
+	return 0;
+out_unlock:
+	spin_unlock(&mq_lock);
+	if (ipc_ns)
+		put_ipc_ns(ipc_ns);
+	return error;
+}
+
+static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
+	dir->i_size -= DIRENT_SIZE;
+	drop_nlink(inode);
+	dput(dentry);
+	return 0;
+}
+
+/*
+ * read() handler for a queue file.
+ * To avoid implementing some sort of mq_receive here, a read only
+ * reports the queue size and notification info (the only values
+ * that are interesting from the user's point of view and aren't
+ * accessible through the standard routines).
+ */
+static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
+				size_t count, loff_t *off)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
+	char buffer[FILENT_SIZE];
+	ssize_t ret;
+
+	spin_lock(&info->lock);
+	snprintf(buffer, sizeof(buffer),
+			"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
+			info->qsize,
+			info->notify_owner ? info->notify.sigev_notify : 0,
+			(info->notify_owner &&
+			 info->notify.sigev_notify == SIGEV_SIGNAL) ?
+				info->notify.sigev_signo : 0,
+			pid_vnr(info->notify_owner));
+	spin_unlock(&info->lock);
+	buffer[sizeof(buffer)-1] = '\0';
+
+	ret = simple_read_from_buffer(u_data, count, off, buffer,
+				strlen(buffer));
+	if (ret <= 0)
+		return ret;
+
+	file_inode(filp)->i_atime = file_inode(filp)->i_ctime = CURRENT_TIME;
+	return ret;
+}
+
+static int mqueue_flush_file(struct file *filp, fl_owner_t id)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
+
+	spin_lock(&info->lock);
+	if (task_tgid(current) == info->notify_owner)
+		remove_notification(info);
+
+	spin_unlock(&info->lock);
+	return 0;
+}
+
+static unsigned int mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
+	int retval = 0;
+
+	poll_wait(filp, &info->wait_q, poll_tab);
+
+	spin_lock(&info->lock);
+	if (info->attr.mq_curmsgs)
+		retval = POLLIN | POLLRDNORM;
+
+	if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
+		retval |= POLLOUT | POLLWRNORM;
+	spin_unlock(&info->lock);
+
+	return retval;
+}
+
+/* Adds current to info->e_wait_q[sr] before element with smaller prio */
+static void wq_add(struct mqueue_inode_info *info, int sr,
+			struct ext_wait_queue *ewp)
+{
+	struct ext_wait_queue *walk;
+
+	ewp->task = current;
+
+	list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
+		if (walk->task->static_prio <= current->static_prio) {
+			list_add_tail(&ewp->list, &walk->list);
+			return;
+		}
+	}
+	list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
+}
+
+/*
+ * Puts current task to sleep. Caller must hold queue lock. After return
+ * lock isn't held.
+ * sr: SEND or RECV
+ */
+static int wq_sleep(struct mqueue_inode_info *info, int sr,
+		    ktime_t *timeout, struct ext_wait_queue *ewp)
+{
+	int retval;
+	signed long time;
+
+	wq_add(info, sr, ewp);
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_unlock(&info->lock);
+		time = schedule_hrtimeout_range_clock(timeout, 0,
+			HRTIMER_MODE_ABS, CLOCK_REALTIME);
+
+		while (ewp->state == STATE_PENDING)
+			cpu_relax();
+
+		if (ewp->state == STATE_READY) {
+			retval = 0;
+			goto out;
+		}
+		spin_lock(&info->lock);
+		if (ewp->state == STATE_READY) {
+			retval = 0;
+			goto out_unlock;
+		}
+		if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		if (time == 0) {
+			retval = -ETIMEDOUT;
+			break;
+		}
+	}
+	list_del(&ewp->list);
+out_unlock:
+	spin_unlock(&info->lock);
+out:
+	return retval;
+}
+
+/*
+ * Returns waiting task that should be serviced first or NULL if none exists
+ */
+static struct ext_wait_queue *wq_get_first_waiter(
+		struct mqueue_inode_info *info, int sr)
+{
+	struct list_head *ptr;
+
+	ptr = info->e_wait_q[sr].list.prev;
+	if (ptr == &info->e_wait_q[sr].list)
+		return NULL;
+	return list_entry(ptr, struct ext_wait_queue, list);
+}
+
+
+static inline void set_cookie(struct sk_buff *skb, char code)
+{
+	((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
+}
+
+/*
+ * Deliver the registered notification, if due, and wake up info->wait_q.
+ * Split out of sys_mq_timedsend only because that function got too long.
+ */
+static void __do_notify(struct mqueue_inode_info *info)
+{
+	/*
+	 * Notify only if a process is registered and the queue just went from
+	 * empty to non-empty; no one is waiting synchronously at this point.
+	 */
+	if (info->notify_owner &&
+	    info->attr.mq_curmsgs == 1) {
+		struct siginfo sig_i;
+		switch (info->notify.sigev_notify) {
+		case SIGEV_NONE:
+			break;
+		case SIGEV_SIGNAL:
+			/* zeroed so no stale stack bytes travel along */
+			memset(&sig_i, 0, sizeof(sig_i));
+			sig_i.si_signo = info->notify.sigev_signo;
+			sig_i.si_errno = 0;
+			sig_i.si_code = SI_MESGQ;
+			sig_i.si_value = info->notify.sigev_value;
+			/* map current pid/uid into info->owner's namespaces */
+			rcu_read_lock();
+			sig_i.si_pid = task_tgid_nr_ns(current,
+						ns_of_pid(info->notify_owner));
+			sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid());
+			rcu_read_unlock();
+
+			kill_pid_info(info->notify.sigev_signo,
+				      &sig_i, info->notify_owner);
+			break;
+		case SIGEV_THREAD:
+			set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
+			netlink_sendskb(info->notify_sock, info->notify_cookie);
+			break;
+		}
+		/* a registration is one-shot: drop it once notified */
+		put_pid(info->notify_owner);
+		put_user_ns(info->notify_user_ns);
+		info->notify_owner = NULL;
+		info->notify_user_ns = NULL;
+	}
+	wake_up(&info->wait_q);
+}
+
+static int prepare_timeout(const struct timespec __user *u_abs_timeout,
+			   ktime_t *expires, struct timespec *ts)
+{
+	if (copy_from_user(ts, u_abs_timeout, sizeof(struct timespec)))
+		return -EFAULT;
+	if (!timespec_valid(ts))
+		return -EINVAL;
+
+	*expires = timespec_to_ktime(*ts);
+	return 0;
+}
+
+static void remove_notification(struct mqueue_inode_info *info)
+{
+	if (info->notify_owner != NULL &&
+	    info->notify.sigev_notify == SIGEV_THREAD) {
+		set_cookie(info->notify_cookie, NOTIFY_REMOVED);
+		netlink_sendskb(info->notify_sock, info->notify_cookie);
+	}
+	put_pid(info->notify_owner);
+	put_user_ns(info->notify_user_ns);
+	info->notify_owner = NULL;
+	info->notify_user_ns = NULL;
+}
+
+/* Validate queue attributes: 0 if usable, else -EINVAL or -EOVERFLOW */
+static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)
+{
+	unsigned long mq_treesize, total_size;
+
+	if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0)
+		return -EINVAL;
+	if (capable(CAP_SYS_RESOURCE)) {
+		if (attr->mq_maxmsg > HARD_MSGMAX ||
+		    attr->mq_msgsize > HARD_MSGSIZEMAX)
+			return -EINVAL;
+	} else {
+		if (attr->mq_maxmsg > ipc_ns->mq_msg_max ||
+				attr->mq_msgsize > ipc_ns->mq_msgsize_max)
+			return -EINVAL;
+	}
+	/* neither the product nor the sum below may wrap */
+	if (attr->mq_msgsize > ULONG_MAX/attr->mq_maxmsg)
+		return -EOVERFLOW;
+	mq_treesize = attr->mq_maxmsg * sizeof(struct msg_msg) +
+		min_t(unsigned int, attr->mq_maxmsg, MQ_PRIO_MAX) *
+		sizeof(struct posix_msg_tree_node);
+	total_size = attr->mq_maxmsg * attr->mq_msgsize;
+	if (total_size + mq_treesize < total_size)
+		return -EOVERFLOW;
+	return 0;
+}
+
+/*
+ * Invoked when creating a new queue via sys_mq_open.
+ *
+ * The attributes are validated with mq_attr_ok(): the caller's @attr when
+ * given, otherwise a set built from the namespace defaults (clamped to the
+ * namespace maxima).  Caller-supplied attributes are parked in d_fsdata for
+ * the duration of vfs_create().  On success the new queue is opened with
+ * dentry_open(); on failure an ERR_PTR() is returned.
+ */
+static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir,
+			struct path *path, int oflag, umode_t mode,
+			struct mq_attr *attr)
+{
+	const struct cred *cred = current_cred();
+	struct dentry *dentry = path->dentry;
+	int err;
+
+	if (!attr) {
+		struct mq_attr def_attr;
+
+		def_attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
+					 ipc_ns->mq_msg_default);
+		def_attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
+					  ipc_ns->mq_msgsize_default);
+		err = mq_attr_ok(ipc_ns, &def_attr);
+	} else {
+		err = mq_attr_ok(ipc_ns, attr);
+	}
+	if (err)
+		return ERR_PTR(err);
+
+	/* store for use during create */
+	if (attr)
+		dentry->d_fsdata = attr;
+
+	mode &= ~current_umask();
+	err = vfs_create(dir, dentry, mode, true);
+	/* the attribute pointer must not outlive the create call */
+	dentry->d_fsdata = NULL;
+	if (err)
+		return ERR_PTR(err);
+
+	return dentry_open(path, oflag, cred);
+}
+
+/*
+ * Opens existing queue.
+ *
+ * Returns the opened file, ERR_PTR(-EINVAL) for the access mode
+ * O_RDWR | O_WRONLY, or ERR_PTR(-EACCES) when inode_permission() denies
+ * the requested access.
+ */
+static struct file *do_open(struct path *path, int oflag)
+{
+	/* access-mode bits of oflag -> mask handed to inode_permission() */
+	static const int oflag2acc[O_ACCMODE] = {
+		MAY_READ,
+		MAY_WRITE,
+		MAY_READ | MAY_WRITE,
+	};
+	const int accmode = oflag & O_ACCMODE;
+	struct inode *inode;
+
+	/* this combination has no slot in the table above */
+	if (accmode == (O_RDWR | O_WRONLY))
+		return ERR_PTR(-EINVAL);
+
+	inode = d_inode(path->dentry);
+	if (inode_permission(inode, oflag2acc[accmode]))
+		return ERR_PTR(-EACCES);
+
+	return dentry_open(path, oflag, current_cred());
+}
+
+/*
+ * mq_open() - open, and optionally create, a message queue by name.
+ *
+ * The name is looked up directly below the root of the mqueue mount of the
+ * caller's ipc namespace.  With O_CREAT a missing queue is created through
+ * do_create(); an existing one is opened through do_open() unless O_EXCL is
+ * also set (-EEXIST).  Without O_CREAT a missing queue yields -ENOENT.
+ *
+ * Returns a new file descriptor (allocated with O_CLOEXEC) on success or a
+ * negative errno.
+ */
+SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
+		struct mq_attr __user *, u_attr)
+{
+	struct path path;
+	struct file *filp;
+	struct filename *name;
+	struct mq_attr attr;
+	int fd, error;
+	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	struct vfsmount *mnt = ipc_ns->mq_mnt;
+	struct dentry *root = mnt->mnt_root;
+	int ro;
+
+	if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
+		return -EFAULT;
+
+	audit_mq_open(oflag, mode, u_attr ? &attr : NULL);
+
+	if (IS_ERR(name = getname(u_name)))
+		return PTR_ERR(name);
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		goto out_putname;	/* fd already holds the errno */
+
+	/*
+	 * A failure here is only remembered in ro: it is reported solely
+	 * when a queue actually has to be created below.
+	 */
+	ro = mnt_want_write(mnt);	/* we'll drop it in any case */
+	error = 0;
+	mutex_lock(&d_inode(root)->i_mutex);
+	path.dentry = lookup_one_len(name->name, root, strlen(name->name));
+	if (IS_ERR(path.dentry)) {
+		error = PTR_ERR(path.dentry);
+		/* no path reference taken yet, so skip path_put() */
+		goto out_putfd;
+	}
+	path.mnt = mntget(mnt);
+
+	if (oflag & O_CREAT) {
+		if (d_really_is_positive(path.dentry)) {	/* entry already exists */
+			audit_inode(name, path.dentry, 0);
+			if (oflag & O_EXCL) {
+				error = -EEXIST;
+				goto out;
+			}
+			filp = do_open(&path, oflag);
+		} else {
+			/* creating needs write access to the mount */
+			if (ro) {
+				error = ro;
+				goto out;
+			}
+			audit_inode_parent_hidden(name, root);
+			filp = do_create(ipc_ns, d_inode(root),
+						&path, oflag, mode,
+						u_attr ? &attr : NULL);
+		}
+	} else {
+		if (d_really_is_negative(path.dentry)) {
+			error = -ENOENT;
+			goto out;
+		}
+		audit_inode(name, path.dentry, 0);
+		filp = do_open(&path, oflag);
+	}
+
+	if (!IS_ERR(filp))
+		fd_install(fd, filp);
+	else
+		error = PTR_ERR(filp);
+out:
+	path_put(&path);
+out_putfd:
+	/* on any error give the reserved descriptor back and return the errno */
+	if (error) {
+		put_unused_fd(fd);
+		fd = error;
+	}
+	mutex_unlock(&d_inode(root)->i_mutex);
+	/* write access is dropped only if mnt_want_write() granted it */
+	if (!ro)
+		mnt_drop_write(mnt);
+out_putname:
+	putname(name);
+	return fd;
+}
+
+/*
+ * mq_unlink() - remove a message queue by name.
+ *
+ * The name is looked up directly below the root of the mqueue mount of the
+ * caller's ipc namespace and removed with vfs_unlink().
+ *
+ * Returns 0 on success, -ENOENT when no queue of that name exists, or the
+ * error reported by getname(), mnt_want_write(), lookup_one_len() or
+ * vfs_unlink().
+ */
+SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
+{
+	int err;
+	struct filename *name;
+	struct dentry *dentry;
+	struct inode *inode = NULL;
+	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+	struct vfsmount *mnt = ipc_ns->mq_mnt;
+
+	name = getname(u_name);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	audit_inode_parent_hidden(name, mnt->mnt_root);
+	err = mnt_want_write(mnt);
+	if (err)
+		goto out_name;
+	mutex_lock_nested(&d_inode(mnt->mnt_root)->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name->name, mnt->mnt_root,
+				strlen(name->name));
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+
+	inode = d_inode(dentry);
+	if (!inode) {
+		err = -ENOENT;
+	} else {
+		/*
+		 * Take an extra inode reference that is only dropped after
+		 * i_mutex has been released (presumably so that the final
+		 * iput() does not run under the directory mutex).
+		 */
+		ihold(inode);
+		err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
+	}
+	dput(dentry);
+
+out_unlock:
+	mutex_unlock(&d_inode(mnt->mnt_root)->i_mutex);
+	/* inode is still NULL when the lookup failed or found nothing */
+	if (inode)
+		iput(inode);
+	mnt_drop_write(mnt);
+out_name:
+	putname(name);
+
+	return err;
+}
+
+/* Pipelined send and receive functions.
+ *
+ * If a receiver finds no waiting message, then it registers itself in the
+ * list of waiting receivers. A sender checks that list before adding the new
+ * message into the message array. If there is a waiting receiver, then it
+ * bypasses the message array and directly hands the message over to the
+ * receiver.
+ * The receiver accepts the message and returns without grabbing the queue
+ * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
+ * are necessary. The same algorithm is used for sysv semaphores, see
+ * ipc/sem.c for more details.
+ *
+ * The same algorithm is used for senders.
+ */
+
+/* pipelined_send() - send a message directly to the task waiting in
+ * sys_mq_timedreceive() (without inserting message into a queue).
+ *
+ * @info is not used here.  The receiver is taken off its wait list and
+ * handed @message through receiver->msg.
+ */
+static inline void pipelined_send(struct mqueue_inode_info *info,
+				  struct msg_msg *message,
+				  struct ext_wait_queue *receiver)
+{
+	receiver->msg = message;
+	list_del(&receiver->list);
+	/*
+	 * Mark the hand-over as in progress, wake the receiver, and only
+	 * then publish STATE_READY; the write barrier orders the earlier
+	 * stores before the final state store.
+	 */
+	receiver->state = STATE_PENDING;
+	wake_up_process(receiver->task);
+	smp_wmb();
+	receiver->state = STATE_READY;
+}
+
+/* pipelined_receive() - if there is a task waiting in sys_mq_timedsend(),
+ * take its message and put it into the queue (we have one free place for
+ * sure).  Without a waiting sender, only the tasks sleeping on
+ * info->wait_q are woken up. */
+static inline void pipelined_receive(struct mqueue_inode_info *info)
+{
+	struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
+
+	if (!sender) {
+		/* for poll */
+		wake_up_interruptible(&info->wait_q);
+		return;
+	}
+	/* if the insert fails the sender is left waiting untouched */
+	if (msg_insert(sender->msg, info))
+		return;
+	list_del(&sender->list);
+	/*
+	 * Same hand-over protocol as pipelined_send(): STATE_PENDING, wake
+	 * up, write barrier, then STATE_READY.
+	 */
+	sender->state = STATE_PENDING;
+	wake_up_process(sender->task);
+	smp_wmb();
+	sender->state = STATE_READY;
+}
+
+/*
+ * mq_timedsend() - add a message to a queue, optionally waiting for room.
+ *
+ * @u_abs_timeout may be NULL, in which case no timeout is handed to
+ * wq_sleep().  A full queue yields -EAGAIN for O_NONBLOCK descriptors;
+ * otherwise the caller sleeps in wq_sleep().  If a receiver is already
+ * waiting, the message is handed over directly instead of being queued.
+ *
+ * Returns 0 on success or a negative errno: -EINVAL for a priority of
+ * MQ_PRIO_MAX or above, -EBADF for a descriptor that is not a message queue
+ * or is not open for writing, -EMSGSIZE when @msg_len exceeds the queue's
+ * mq_msgsize, or an error from prepare_timeout(), load_msg(), msg_insert()
+ * or wq_sleep().
+ */
+SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
+		size_t, msg_len, unsigned int, msg_prio,
+		const struct timespec __user *, u_abs_timeout)
+{
+	struct fd f;
+	struct inode *inode;
+	struct ext_wait_queue wait;
+	struct ext_wait_queue *receiver;
+	struct msg_msg *msg_ptr;
+	struct mqueue_inode_info *info;
+	ktime_t expires, *timeout = NULL;
+	struct timespec ts;
+	struct posix_msg_tree_node *new_leaf = NULL;
+	int ret = 0;
+
+	if (u_abs_timeout) {
+		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
+		if (res)
+			return res;
+		timeout = &expires;
+	}
+
+	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
+		return -EINVAL;
+
+	audit_mq_sendrecv(mqdes, msg_len, msg_prio, timeout ? &ts : NULL);
+
+	f = fdget(mqdes);
+	if (unlikely(!f.file)) {
+		ret = -EBADF;
+		goto out;
+	}
+
+	inode = file_inode(f.file);
+	/* only descriptors that refer to a message queue are accepted */
+	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+		ret = -EBADF;
+		goto out_fput;
+	}
+	info = MQUEUE_I(inode);
+	audit_file(f.file);
+
+	if (unlikely(!(f.file->f_mode & FMODE_WRITE))) {
+		ret = -EBADF;
+		goto out_fput;
+	}
+
+	if (unlikely(msg_len > info->attr.mq_msgsize)) {
+		ret = -EMSGSIZE;
+		goto out_fput;
+	}
+
+	/* First try to allocate memory, before doing anything with
+	 * existing queues. */
+	msg_ptr = load_msg(u_msg_ptr, msg_len);
+	if (IS_ERR(msg_ptr)) {
+		ret = PTR_ERR(msg_ptr);
+		goto out_fput;
+	}
+	msg_ptr->m_ts = msg_len;
+	/* the priority travels in the message's m_type field */
+	msg_ptr->m_type = msg_prio;
+
+	/*
+	 * msg_insert really wants us to have a valid, spare node struct so
+	 * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
+	 * fall back to that if necessary.
+	 */
+	if (!info->node_cache)
+		new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
+
+	spin_lock(&info->lock);
+
+	/*
+	 * Re-check under the lock: the cache may have been filled since the
+	 * unlocked test above, in which case our node is simply freed.
+	 */
+	if (!info->node_cache && new_leaf) {
+		/* Save our speculative allocation into the cache */
+		INIT_LIST_HEAD(&new_leaf->msg_list);
+		info->node_cache = new_leaf;
+		info->qsize += sizeof(*new_leaf);
+		new_leaf = NULL;
+	} else {
+		kfree(new_leaf);
+	}
+
+	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
+		/* queue is full */
+		if (f.file->f_flags & O_NONBLOCK) {
+			ret = -EAGAIN;
+		} else {
+			wait.task = current;
+			wait.msg = (void *) msg_ptr;
+			wait.state = STATE_NONE;
+			ret = wq_sleep(info, SEND, timeout, &wait);
+			/*
+			 * wq_sleep must be called with info->lock held, and
+			 * returns with the lock released
+			 */
+			goto out_free;
+		}
+	} else {
+		receiver = wq_get_first_waiter(info, RECV);
+		if (receiver) {
+			pipelined_send(info, msg_ptr, receiver);
+		} else {
+			/* adds message to the queue */
+			ret = msg_insert(msg_ptr, info);
+			if (ret)
+				goto out_unlock;
+			__do_notify(info);
+		}
+		inode->i_atime = inode->i_mtime = inode->i_ctime =
+				CURRENT_TIME;
+	}
+out_unlock:
+	spin_unlock(&info->lock);
+out_free:
+	/* on failure the message was not consumed and is still ours to free */
+	if (ret)
+		free_msg(msg_ptr);
+out_fput:
+	fdput(f);
+out:
+	return ret;
+}
+
+/*
+ * mq_timedreceive() - take a message from a queue, optionally waiting.
+ *
+ * @u_abs_timeout may be NULL, in which case no timeout is handed to
+ * wq_sleep().  An empty queue yields -EAGAIN for O_NONBLOCK descriptors;
+ * otherwise the caller sleeps in wq_sleep().  When @u_msg_prio is not NULL
+ * the message's m_type value is stored through it.
+ *
+ * Returns the length of the received message or a negative errno: -EBADF
+ * for a descriptor that is not a message queue or is not open for reading,
+ * -EMSGSIZE when @msg_len is smaller than the queue's mq_msgsize, -EFAULT
+ * when copying to user space fails, or an error from prepare_timeout() or
+ * wq_sleep().
+ */
+SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
+		size_t, msg_len, unsigned int __user *, u_msg_prio,
+		const struct timespec __user *, u_abs_timeout)
+{
+	ssize_t ret;
+	struct msg_msg *msg_ptr;
+	struct fd f;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+	struct ext_wait_queue wait;
+	ktime_t expires, *timeout = NULL;
+	struct timespec ts;
+	struct posix_msg_tree_node *new_leaf = NULL;
+
+	if (u_abs_timeout) {
+		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
+		if (res)
+			return res;
+		timeout = &expires;
+	}
+
+	audit_mq_sendrecv(mqdes, msg_len, 0, timeout ? &ts : NULL);
+
+	f = fdget(mqdes);
+	if (unlikely(!f.file)) {
+		ret = -EBADF;
+		goto out;
+	}
+
+	inode = file_inode(f.file);
+	/* only descriptors that refer to a message queue are accepted */
+	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+		ret = -EBADF;
+		goto out_fput;
+	}
+	info = MQUEUE_I(inode);
+	audit_file(f.file);
+
+	if (unlikely(!(f.file->f_mode & FMODE_READ))) {
+		ret = -EBADF;
+		goto out_fput;
+	}
+
+	/* checks if buffer is big enough */
+	if (unlikely(msg_len < info->attr.mq_msgsize)) {
+		ret = -EMSGSIZE;
+		goto out_fput;
+	}
+
+	/*
+	 * msg_insert really wants us to have a valid, spare node struct so
+	 * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
+	 * fall back to that if necessary.
+	 */
+	if (!info->node_cache)
+		new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
+
+	spin_lock(&info->lock);
+
+	/*
+	 * Re-check under the lock: the cache may have been filled since the
+	 * unlocked test above, in which case our node is simply freed.
+	 */
+	if (!info->node_cache && new_leaf) {
+		/* Save our speculative allocation into the cache */
+		INIT_LIST_HEAD(&new_leaf->msg_list);
+		info->node_cache = new_leaf;
+		info->qsize += sizeof(*new_leaf);
+	} else {
+		kfree(new_leaf);
+	}
+
+	/*
+	 * Every branch below leaves info->lock released: explicitly in the
+	 * two spin_unlock() calls; the wq_sleep() branch has no unlock of
+	 * its own.
+	 */
+	if (info->attr.mq_curmsgs == 0) {
+		if (f.file->f_flags & O_NONBLOCK) {
+			spin_unlock(&info->lock);
+			ret = -EAGAIN;
+		} else {
+			wait.task = current;
+			wait.state = STATE_NONE;
+			ret = wq_sleep(info, RECV, timeout, &wait);
+			/* only looked at below when ret == 0 */
+			msg_ptr = wait.msg;
+		}
+	} else {
+		msg_ptr = msg_get(info);
+
+		inode->i_atime = inode->i_mtime = inode->i_ctime =
+				CURRENT_TIME;
+
+		/* There is now free space in queue. */
+		pipelined_receive(info);
+		spin_unlock(&info->lock);
+		ret = 0;
+	}
+	if (ret == 0) {
+		ret = msg_ptr->m_ts;
+
+		if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
+			store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
+			ret = -EFAULT;
+		}
+		/* the message is freed whether or not the copy-out worked */
+		free_msg(msg_ptr);
+	}
+out_fput:
+	fdput(f);
+out:
+	return ret;
+}
+
+/*
+ * mq_notify() - register or remove the caller's notification on a queue.
+ *
+ * With a non-NULL @u_notification the calling thread group becomes the
+ * notification owner of the queue, or -EBUSY is returned when an owner is
+ * already registered.  SIGEV_NONE, SIGEV_SIGNAL and SIGEV_THREAD are
+ * accepted.  For SIGEV_THREAD, sigev_signo carries the descriptor of a
+ * netlink socket and sigev_value.sival_ptr points at NOTIFY_COOKIE_LEN
+ * bytes of cookie, which are copied into an skb attached to that socket.
+ *
+ * Notes: the case when user wants us to deregister (with NULL as pointer)
+ * and he isn't currently owner of notification, will be silently discarded.
+ * It isn't explicitly defined in the POSIX.
+ *
+ * Returns 0 on success or a negative errno (-EFAULT, -EINVAL, -ENOMEM,
+ * -EBADF, -EBUSY, or an error from the netlink helpers).
+ */
+SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
+		const struct sigevent __user *, u_notification)
+{
+	int ret;
+	struct fd f;
+	struct sock *sock;
+	struct inode *inode;
+	struct sigevent notification;
+	struct mqueue_inode_info *info;
+	struct sk_buff *nc;
+
+	if (u_notification) {
+		if (copy_from_user(&notification, u_notification,
+					sizeof(struct sigevent)))
+			return -EFAULT;
+	}
+
+	audit_mq_notify(mqdes, u_notification ? &notification : NULL);
+
+	nc = NULL;
+	sock = NULL;
+	if (u_notification != NULL) {
+		if (unlikely(notification.sigev_notify != SIGEV_NONE &&
+			     notification.sigev_notify != SIGEV_SIGNAL &&
+			     notification.sigev_notify != SIGEV_THREAD))
+			return -EINVAL;
+		if (notification.sigev_notify == SIGEV_SIGNAL &&
+			!valid_signal(notification.sigev_signo)) {
+			return -EINVAL;
+		}
+		if (notification.sigev_notify == SIGEV_THREAD) {
+			long timeo;
+
+			/* create the notify skb */
+			nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
+			if (!nc) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (copy_from_user(nc->data,
+					notification.sigev_value.sival_ptr,
+					NOTIFY_COOKIE_LEN)) {
+				ret = -EFAULT;
+				goto out;
+			}
+
+			/* TODO: add a header? */
+			skb_put(nc, NOTIFY_COOKIE_LEN);
+			/* and attach it to the socket */
+retry:
+			f = fdget(notification.sigev_signo);
+			if (!f.file) {
+				ret = -EBADF;
+				goto out;
+			}
+			sock = netlink_getsockbyfilp(f.file);
+			fdput(f);
+			if (IS_ERR(sock)) {
+				ret = PTR_ERR(sock);
+				sock = NULL;
+				goto out;
+			}
+
+			timeo = MAX_SCHEDULE_TIMEOUT;
+			ret = netlink_attachskb(sock, nc, &timeo, NULL);
+			if (ret == 1) {
+				/*
+				 * netlink_attachskb() has already dropped the
+				 * socket reference when it asks for a retry.
+				 * Forget the pointer so that a failure while
+				 * retrying cannot reach netlink_detachskb()
+				 * at "out" and put the socket a second time.
+				 */
+				sock = NULL;
+				goto retry;
+			}
+			if (ret) {
+				/* on error the helper released sock and nc */
+				sock = NULL;
+				nc = NULL;
+				goto out;
+			}
+		}
+	}
+
+	f = fdget(mqdes);
+	if (!f.file) {
+		ret = -EBADF;
+		goto out;
+	}
+
+	inode = file_inode(f.file);
+	/* only descriptors that refer to a message queue are accepted */
+	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+		ret = -EBADF;
+		goto out_fput;
+	}
+	info = MQUEUE_I(inode);
+
+	ret = 0;
+	spin_lock(&info->lock);
+	if (u_notification == NULL) {
+		/* deregistration is honoured only for the current owner */
+		if (info->notify_owner == task_tgid(current)) {
+			remove_notification(info);
+			inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		}
+	} else if (info->notify_owner != NULL) {
+		ret = -EBUSY;
+	} else {
+		switch (notification.sigev_notify) {
+		case SIGEV_NONE:
+			info->notify.sigev_notify = SIGEV_NONE;
+			break;
+		case SIGEV_THREAD:
+			/*
+			 * The queue takes over the socket and the cookie;
+			 * clear the locals so "out" leaves them alone.
+			 */
+			info->notify_sock = sock;
+			info->notify_cookie = nc;
+			sock = NULL;
+			nc = NULL;
+			info->notify.sigev_notify = SIGEV_THREAD;
+			break;
+		case SIGEV_SIGNAL:
+			info->notify.sigev_signo = notification.sigev_signo;
+			info->notify.sigev_value = notification.sigev_value;
+			info->notify.sigev_notify = SIGEV_SIGNAL;
+			break;
+		}
+
+		info->notify_owner = get_pid(task_tgid(current));
+		info->notify_user_ns = get_user_ns(current_user_ns());
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}
+	spin_unlock(&info->lock);
+out_fput:
+	fdput(f);
+out:
+	/* release whatever was prepared but not handed over to the queue */
+	if (sock)
+		netlink_detachskb(sock, nc);
+	else if (nc)
+		dev_kfree_skb(nc);
+
+	return ret;
+}
+
+/*
+ * mq_getsetattr() - read the attributes of a queue and/or change its flags.
+ *
+ * When @u_mqstat is given, only O_NONBLOCK may be set in mq_flags (anything
+ * else is -EINVAL) and the descriptor's O_NONBLOCK flag is updated to
+ * match.  When @u_omqstat is given, it receives the queue attributes with
+ * mq_flags reflecting the descriptor's O_NONBLOCK state from before any
+ * update made by this call.
+ *
+ * Returns 0 on success, -EBADF for a descriptor that is not a message
+ * queue, or -EFAULT when a user-space copy fails.
+ */
+SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
+		const struct mq_attr __user *, u_mqstat,
+		struct mq_attr __user *, u_omqstat)
+{
+	int ret;
+	struct mq_attr mqstat, omqstat;
+	struct fd f;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+
+	if (u_mqstat != NULL) {
+		if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr)))
+			return -EFAULT;
+		if (mqstat.mq_flags & (~O_NONBLOCK))
+			return -EINVAL;
+	}
+
+	f = fdget(mqdes);
+	if (!f.file) {
+		ret = -EBADF;
+		goto out;
+	}
+
+	inode = file_inode(f.file);
+	/* only descriptors that refer to a message queue are accepted */
+	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+		ret = -EBADF;
+		goto out_fput;
+	}
+	info = MQUEUE_I(inode);
+
+	spin_lock(&info->lock);
+
+	/* snapshot the old state before anything is modified */
+	omqstat = info->attr;
+	omqstat.mq_flags = f.file->f_flags & O_NONBLOCK;
+	if (u_mqstat) {
+		audit_mq_getsetattr(mqdes, &mqstat);
+		/* f_flags is changed under the file's own f_lock */
+		spin_lock(&f.file->f_lock);
+		if (mqstat.mq_flags & O_NONBLOCK)
+			f.file->f_flags |= O_NONBLOCK;
+		else
+			f.file->f_flags &= ~O_NONBLOCK;
+		spin_unlock(&f.file->f_lock);
+
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}
+
+	spin_unlock(&info->lock);
+
+	/* the copy to user space happens after info->lock is dropped */
+	ret = 0;
+	if (u_omqstat != NULL && copy_to_user(u_omqstat, &omqstat,
+						sizeof(struct mq_attr)))
+		ret = -EFAULT;
+
+out_fput:
+	fdput(f);
+out:
+	return ret;
+}
+
+/*
+ * Inode operations table with lookup, create and unlink handlers
+ * (presumably installed on the mqueue directory inode; the user of this
+ * table is not in this part of the file).
+ */
+static const struct inode_operations mqueue_dir_inode_operations = {
+	.create	= mqueue_create,
+	.unlink	= mqueue_unlink,
+	.lookup	= simple_lookup,
+};
+
+/*
+ * File operations of an open message queue.  The mq_* system calls compare
+ * a file's f_op against this table to check that a descriptor really
+ * refers to a message queue.
+ */
+static const struct file_operations mqueue_file_operations = {
+	.read	= mqueue_read_file,
+	.poll	= mqueue_poll_file,
+	.flush	= mqueue_flush_file,
+	.llseek	= default_llseek,
+};
+
+/*
+ * Superblock operations: inode allocation, destruction and eviction hooks
+ * plus simple_statfs.
+ */
+static const struct super_operations mqueue_super_ops = {
+	.statfs		= simple_statfs,
+	.alloc_inode	= mqueue_alloc_inode,
+	.destroy_inode	= mqueue_destroy_inode,
+	.evict_inode	= mqueue_evict_inode,
+};
+
+/*
+ * The "mqueue" filesystem type.  It is registered by init_mqueue_fs() and
+ * mounted once per ipc namespace by mq_init_ns().
+ */
+static struct file_system_type mqueue_fs_type = {
+	.name		= "mqueue",
+	.fs_flags	= FS_USERNS_MOUNT,
+	.mount		= mqueue_mount,
+	.kill_sb	= kill_litter_super,
+};
+
+/*
+ * Set up the POSIX message queue state of an ipc namespace: reset the queue
+ * count, load the default limits and create the namespace's mqueue mount.
+ *
+ * Returns 0 on success.  If the mount cannot be created, ns->mq_mnt is left
+ * NULL and the error from kern_mount_data() is returned.
+ */
+int mq_init_ns(struct ipc_namespace *ns)
+{
+	int err;
+
+	ns->mq_queues_count = 0;
+	ns->mq_queues_max = DFLT_QUEUESMAX;
+	ns->mq_msg_max = DFLT_MSGMAX;
+	ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
+	ns->mq_msg_default = DFLT_MSG;
+	ns->mq_msgsize_default = DFLT_MSGSIZE;
+
+	ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
+	if (!IS_ERR(ns->mq_mnt))
+		return 0;
+
+	/* never leave an error pointer behind in the namespace */
+	err = PTR_ERR(ns->mq_mnt);
+	ns->mq_mnt = NULL;
+	return err;
+}
+
+/* Clear s_fs_info in the superblock of the namespace's mqueue mount. */
+void mq_clear_sbinfo(struct ipc_namespace *ns)
+{
+	struct super_block *sb = ns->mq_mnt->mnt_sb;
+
+	sb->s_fs_info = NULL;
+}
+
+/* Unmount the mqueue mount that belongs to the given ipc namespace. */
+void mq_put_mnt(struct ipc_namespace *ns)
+{
+	struct vfsmount *mnt = ns->mq_mnt;
+
+	kern_unmount(mnt);
+}
+
+/*
+ * Boot-time initialisation: create the inode cache, register the sysctl
+ * table (best effort) and the "mqueue" filesystem, then set up the message
+ * queue state of the initial ipc namespace.
+ *
+ * Returns 0 on success or a negative errno after undoing the earlier steps.
+ */
+static int __init init_mqueue_fs(void)
+{
+	int err;
+
+	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
+				sizeof(struct mqueue_inode_info), 0,
+				SLAB_HWCACHE_ALIGN, init_once);
+	if (!mqueue_inode_cachep)
+		return -ENOMEM;
+
+	/* ignore failures - they are not fatal */
+	mq_sysctl_table = mq_register_sysctl_table();
+
+	err = register_filesystem(&mqueue_fs_type);
+	if (err)
+		goto out_sysctl;
+
+	spin_lock_init(&mq_lock);
+
+	err = mq_init_ns(&init_ipc_ns);
+	if (!err)
+		return 0;
+
+	/* unwind in reverse order of setup */
+	unregister_filesystem(&mqueue_fs_type);
+out_sysctl:
+	if (mq_sysctl_table)
+		unregister_sysctl_table(mq_sysctl_table);
+	kmem_cache_destroy(mqueue_inode_cachep);
+	return err;
+}
+
+device_initcall(init_mqueue_fs);
diff --git a/ipc/msg.c b/ipc/msg.c
new file mode 100644
index 000000000..2b6fdbb9e
--- /dev/null
+++ b/ipc/msg.c
@@ -0,0 +1,1046 @@
+/*
+ * linux/ipc/msg.c
+ * Copyright (C) 1992 Krishna Balasubramanian
+ *
+ * Removed all the remaining kerneld mess
+ * Catch the -EFAULT stuff properly
+ * Use GFP_KERNEL for messages as in 1.2
+ * Fixed up the unchecked user space derefs
+ * Copyright (C) 1998 Alan Cox & Andi Kleen
+ *
+ * /proc/sysvipc/msg support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
+ *
+ * mostly rewritten, threaded and wake-one semantics added
+ * MSGMAX limit removed, sysctl's added
+ * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
+ *
+ * support for audit of ipc object properties and permission changes
+ * Dustin Kirkland <dustin.kirkland@us.ibm.com>
+ *
+ * namespaces support
+ * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ */
+
+#include <linux/capability.h>
+#include <linux/msg.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/list.h>
+#include <linux/security.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/audit.h>
+#include <linux/seq_file.h>
+#include <linux/rwsem.h>
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+
+#include <asm/current.h>
+#include <linux/uaccess.h>
+#include "util.h"
+
+/* one msg_receiver structure for each sleeping receiver */
+struct msg_receiver {
+	/* entry in the queue's q_receivers list */
+	struct list_head	r_list;
+	/* task that is woken with wake_up_process() */
+	struct task_struct	*r_tsk;
+
+	/* search mode and message type, as handed to testmsg() */
+	int			r_mode;
+	long			r_msgtype;
+	/* a message with m_ts above this is answered with -E2BIG */
+	long			r_maxsize;
+
+	/*
+	 * Mark r_msg volatile so that the compiler
+	 * does not try to get smart and optimize
+	 * it. We rely on this for the lockless
+	 * receive algorithm.
+	 *
+	 * Wakers first store NULL and, after the wake-up and a memory
+	 * barrier, the final value: a message or an ERR_PTR().
+	 */
+	struct msg_msg		*volatile r_msg;
+};
+
+/* one msg_sender for each sleeping sender */
+struct msg_sender {
+	/*
+	 * Entry in the queue's q_senders list.  ss_wakeup() sets list.next
+	 * to NULL when called with kill set; ss_del() then skips list_del().
+	 */
+	struct list_head	list;
+	/* the sleeping task, set to current by ss_add() */
+	struct task_struct	*tsk;
+};
+
+/* receive search modes, evaluated by testmsg() */
+#define SEARCH_ANY		1	/* every message matches */
+#define SEARCH_EQUAL		2	/* m_type == requested type */
+#define SEARCH_NOTEQUAL		3	/* m_type != requested type */
+#define SEARCH_LESSEQUAL	4	/* m_type <= requested type */
+#define SEARCH_NUMBER		5	/* testmsg() accepts every message */
+
+/* the message queue id set (slot IPC_MSG_IDS) of an ipc namespace */
+#define msg_ids(ns)	((ns)->ids[IPC_MSG_IDS])
+
+/*
+ * Look up a message queue in the namespace's id set with
+ * ipc_obtain_object() and return the msg_queue that embeds the result.
+ * Lookup errors are passed through as an ERR_PTR().
+ */
+static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id)
+{
+	struct kern_ipc_perm *perm;
+
+	perm = ipc_obtain_object(&msg_ids(ns), id);
+	if (!IS_ERR(perm))
+		return container_of(perm, struct msg_queue, q_perm);
+
+	return ERR_CAST(perm);
+}
+
+/*
+ * Same as msq_obtain_object(), but the lookup goes through
+ * ipc_obtain_object_check().  Lookup errors are passed through as an
+ * ERR_PTR().
+ */
+static inline struct msg_queue *msq_obtain_object_check(struct ipc_namespace *ns,
+							int id)
+{
+	struct kern_ipc_perm *perm;
+
+	perm = ipc_obtain_object_check(&msg_ids(ns), id);
+	if (!IS_ERR(perm))
+		return container_of(perm, struct msg_queue, q_perm);
+
+	return ERR_CAST(perm);
+}
+
+/* Remove a message queue from the id set of its ipc namespace. */
+static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s)
+{
+	struct kern_ipc_perm *perm = &s->q_perm;
+
+	ipc_rmid(&msg_ids(ns), perm);
+}
+
+/*
+ * RCU callback for message queues: release the security state of the queue
+ * and then hand the rcu head on to ipc_rcu_free().
+ */
+static void msg_rcu_free(struct rcu_head *head)
+{
+	struct ipc_rcu *hdr = container_of(head, struct ipc_rcu, rcu);
+	struct msg_queue *queue = ipc_rcu_to_struct(hdr);
+
+	security_msg_queue_free(queue);
+
+	ipc_rcu_free(head);
+}
+
+/**
+ * newque - Create a new msg queue
+ * @ns: namespace
+ * @params: ptr to the structure that contains the key and msgflg
+ *
+ * Called with msg_ids.rwsem held (writer)
+ *
+ * Returns the id of the new queue, -ENOMEM when the queue cannot be
+ * allocated, or the error from security_msg_queue_alloc() or ipc_addid().
+ */
+static int newque(struct ipc_namespace *ns, struct ipc_params *params)
+{
+	struct msg_queue *msq;
+	int id, retval;
+	key_t key = params->key;
+	int msgflg = params->flg;
+
+	msq = ipc_rcu_alloc(sizeof(*msq));
+	if (!msq)
+		return -ENOMEM;
+
+	msq->q_perm.mode = msgflg & S_IRWXUGO;
+	msq->q_perm.key = key;
+
+	msq->q_perm.security = NULL;
+	retval = security_msg_queue_alloc(msq);
+	if (retval) {
+		/* no security state to release yet: plain ipc_rcu_free */
+		ipc_rcu_putref(msq, ipc_rcu_free);
+		return retval;
+	}
+
+	/*
+	 * Fully initialise the queue before ipc_addid() publishes it.
+	 * msgctl_nolock() looks queues up and reads these fields under
+	 * rcu_read_lock() only, without taking the object lock, so it must
+	 * never be able to find a half-initialised queue.
+	 */
+	msq->q_stime = msq->q_rtime = 0;
+	msq->q_ctime = get_seconds();
+	msq->q_cbytes = msq->q_qnum = 0;
+	msq->q_qbytes = ns->msg_ctlmnb;
+	msq->q_lspid = msq->q_lrpid = 0;
+	INIT_LIST_HEAD(&msq->q_messages);
+	INIT_LIST_HEAD(&msq->q_receivers);
+	INIT_LIST_HEAD(&msq->q_senders);
+
+	/* ipc_addid() locks msq upon success. */
+	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
+	if (id < 0) {
+		ipc_rcu_putref(msq, msg_rcu_free);
+		return id;
+	}
+
+	ipc_unlock_object(&msq->q_perm);
+	rcu_read_unlock();
+
+	return msq->q_perm.id;
+}
+
+/*
+ * Queue the current task as a sleeping sender on @msq: record it in @mss,
+ * mark it TASK_INTERRUPTIBLE and append @mss to the q_senders list.
+ * The function itself does not sleep.
+ */
+static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss)
+{
+	mss->tsk = current;
+	__set_current_state(TASK_INTERRUPTIBLE);
+	list_add_tail(&mss->list, &msq->q_senders);
+}
+
+/*
+ * Take a sender off its list, unless ss_wakeup() has already marked the
+ * entry by setting list.next to NULL.
+ */
+static inline void ss_del(struct msg_sender *mss)
+{
+	if (!mss->list.next)
+		return;
+
+	list_del(&mss->list);
+}
+
+/*
+ * Wake every sender queued on list @h.  With @kill set, each entry's
+ * list.next is set to NULL first, which makes a later ss_del() skip
+ * list_del() for it.
+ */
+static void ss_wakeup(struct list_head *h, int kill)
+{
+	struct msg_sender *sender, *next;
+
+	list_for_each_entry_safe(sender, next, h, list) {
+		if (kill)
+			sender->list.next = NULL;
+
+		wake_up_process(sender->tsk);
+	}
+}
+
+/*
+ * Wake every receiver sleeping on @msq and hand each of them ERR_PTR(@res)
+ * as the result of its receive.  The receivers are not unlinked from
+ * q_receivers here.
+ */
+static void expunge_all(struct msg_queue *msq, int res)
+{
+	struct msg_receiver *msr, *t;
+
+	list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
+		msr->r_msg = NULL; /* initialize expunge ordering */
+		wake_up_process(msr->r_tsk);
+		/*
+		 * Ensure that the wakeup is visible before setting r_msg as
+		 * the receiving end depends on it: either spinning on a nil,
+		 * or dealing with -EAGAIN cases. See lockless receive part 1
+		 * and 2 in do_msgrcv().
+		 */
+		smp_mb();
+		msr->r_msg = ERR_PTR(res);
+	}
+}
+
+/*
+ * freeque() wakes up waiters on the sender and receiver waiting queue,
+ * removes the message queue from message queue ID IDR, and cleans up all the
+ * messages associated with this queue.
+ *
+ * msg_ids.rwsem (writer) and the spinlock for this message queue are held
+ * before freeque() is called. msg_ids.rwsem remains locked on exit.
+ * The object lock and the RCU read lock are both released in here.
+ */
+static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+{
+	struct msg_msg *msg, *t;
+	struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
+
+	/* receivers get -EIDRM; senders are woken with their entry marked */
+	expunge_all(msq, -EIDRM);
+	ss_wakeup(&msq->q_senders, 1);
+	msg_rmid(ns, msq);
+	ipc_unlock_object(&msq->q_perm);
+	rcu_read_unlock();
+
+	/* free the queued messages and fix up the namespace counters */
+	list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
+		atomic_dec(&ns->msg_hdrs);
+		free_msg(msg);
+	}
+	atomic_sub(msq->q_cbytes, &ns->msg_bytes);
+	ipc_rcu_putref(msq, msg_rcu_free);
+}
+
+/*
+ * Called with msg_ids.rwsem and ipcp locked.
+ *
+ * ->associate hook of msgget(): forwards to the security module with the
+ * msg_queue that embeds @ipcp.
+ */
+static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg)
+{
+	struct msg_queue *queue;
+
+	queue = container_of(ipcp, struct msg_queue, q_perm);
+	return security_msg_queue_associate(queue, msgflg);
+}
+
+/*
+ * msgget() - hand @key and @msgflg to ipcget() for the message queue id set
+ * of the caller's ipc namespace, with newque() as the creation hook and
+ * msg_security() as the association hook.  Returns ipcget()'s result.
+ */
+SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
+{
+	static const struct ipc_ops msg_ops = {
+		.getnew = newque,
+		.associate = msg_security,
+	};
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+	struct ipc_params msg_params;
+
+	msg_params.flg = msgflg;
+	msg_params.key = key;
+
+	return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
+}
+
+/*
+ * Copy queue status to user space in the layout selected by @version.
+ *
+ * IPC_64 copies *in unchanged.  IPC_OLD converts to struct msqid_ds: the
+ * msg_cbytes, msg_qnum and msg_qbytes fields are capped at USHRT_MAX, while
+ * msg_lcbytes and msg_lqbytes receive the uncapped values.
+ *
+ * Returns the result of copy_to_user() (0 on success), or -EINVAL converted
+ * to unsigned long for an unknown @version.
+ */
+static inline unsigned long
+copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
+{
+	struct msqid_ds out;
+
+	if (version == IPC_64)
+		return copy_to_user(buf, in, sizeof(*in));
+	if (version != IPC_OLD)
+		return -EINVAL;
+
+	/* start from zeroes so that no stack contents reach user space */
+	memset(&out, 0, sizeof(out));
+
+	ipc64_perm_to_ipc_perm(&in->msg_perm, &out.msg_perm);
+
+	out.msg_stime = in->msg_stime;
+	out.msg_rtime = in->msg_rtime;
+	out.msg_ctime = in->msg_ctime;
+
+	out.msg_cbytes = in->msg_cbytes > USHRT_MAX ?
+				USHRT_MAX : in->msg_cbytes;
+	out.msg_lcbytes = in->msg_cbytes;
+
+	out.msg_qnum = in->msg_qnum > USHRT_MAX ?
+				USHRT_MAX : in->msg_qnum;
+
+	out.msg_qbytes = in->msg_qbytes > USHRT_MAX ?
+				USHRT_MAX : in->msg_qbytes;
+	out.msg_lqbytes = in->msg_qbytes;
+
+	out.msg_lspid = in->msg_lspid;
+	out.msg_lrpid = in->msg_lrpid;
+
+	return copy_to_user(buf, &out, sizeof(out));
+}
+
+/*
+ * Read queue settings from user space in the layout selected by @version.
+ *
+ * IPC_64 fills *out directly.  IPC_OLD reads a struct msqid_ds and takes
+ * over only uid, gid, mode and the queue size, where msg_lqbytes is used
+ * when msg_qbytes is 0.
+ *
+ * Returns 0 on success; -EFAULT (failed copy) and -EINVAL (unknown
+ * @version) are returned converted to unsigned long.
+ */
+static inline unsigned long
+copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
+{
+	struct msqid_ds tbuf_old;
+
+	if (version == IPC_64)
+		return copy_from_user(out, buf, sizeof(*out)) ? -EFAULT : 0;
+	if (version != IPC_OLD)
+		return -EINVAL;
+
+	if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
+		return -EFAULT;
+
+	out->msg_perm.uid = tbuf_old.msg_perm.uid;
+	out->msg_perm.gid = tbuf_old.msg_perm.gid;
+	out->msg_perm.mode = tbuf_old.msg_perm.mode;
+
+	out->msg_qbytes = tbuf_old.msg_qbytes ?
+				tbuf_old.msg_qbytes : tbuf_old.msg_lqbytes;
+
+	return 0;
+}
+
+/*
+ * This function handles some msgctl commands which require the rwsem
+ * to be held in write mode.
+ * NOTE: no locks must be held, the rwsem is taken inside this function.
+ *
+ * Handles IPC_RMID and IPC_SET; any other command yields -EINVAL.
+ * Returns 0 on success or a negative errno.
+ */
+static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
+		       struct msqid_ds __user *buf, int version)
+{
+	struct kern_ipc_perm *ipcp;
+	/* only filled in from user space for IPC_SET */
+	struct msqid64_ds uninitialized_var(msqid64);
+	struct msg_queue *msq;
+	int err;
+
+	if (cmd == IPC_SET) {
+		if (copy_msqid_from_user(&msqid64, buf, version))
+			return -EFAULT;
+	}
+
+	down_write(&msg_ids(ns).rwsem);
+	rcu_read_lock();
+
+	ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd,
+				      &msqid64.msg_perm, msqid64.msg_qbytes);
+	if (IS_ERR(ipcp)) {
+		err = PTR_ERR(ipcp);
+		goto out_unlock1;
+	}
+
+	msq = container_of(ipcp, struct msg_queue, q_perm);
+
+	err = security_msg_queue_msgctl(msq, cmd);
+	if (err)
+		goto out_unlock1;
+
+	switch (cmd) {
+	case IPC_RMID:
+		ipc_lock_object(&msq->q_perm);
+		/* freeque unlocks the ipc object and rcu */
+		freeque(ns, ipcp);
+		goto out_up;
+	case IPC_SET:
+		/* raising q_qbytes above msg_ctlmnb needs CAP_SYS_RESOURCE */
+		if (msqid64.msg_qbytes > ns->msg_ctlmnb &&
+		    !capable(CAP_SYS_RESOURCE)) {
+			err = -EPERM;
+			goto out_unlock1;
+		}
+
+		ipc_lock_object(&msq->q_perm);
+		err = ipc_update_perm(&msqid64.msg_perm, ipcp);
+		if (err)
+			goto out_unlock0;
+
+		msq->q_qbytes = msqid64.msg_qbytes;
+
+		msq->q_ctime = get_seconds();
+		/* sleeping receivers might be excluded by
+		 * stricter permissions.
+		 */
+		expunge_all(msq, -EAGAIN);
+		/* sleeping senders might be able to send
+		 * due to a larger queue size.
+		 */
+		ss_wakeup(&msq->q_senders, 0);
+		break;
+	default:
+		err = -EINVAL;
+		goto out_unlock1;
+	}
+
+	/* unwind: object lock, then RCU, then the rwsem */
+out_unlock0:
+	ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+	rcu_read_unlock();
+out_up:
+	up_write(&msg_ids(ns).rwsem);
+	return err;
+}
+
+/*
+ * Handle the msgctl commands that only read state: IPC_INFO, MSG_INFO,
+ * MSG_STAT and IPC_STAT.  Any other command yields -EINVAL.
+ *
+ * The two *_INFO commands return the value of ipc_get_maxid() (or 0 when it
+ * is negative), MSG_STAT returns the id of the queue and IPC_STAT returns
+ * 0.  Failures are reported as a negative errno.
+ */
+static int msgctl_nolock(struct ipc_namespace *ns, int msqid,
+			 int cmd, int version, void __user *buf)
+{
+	int err;
+	struct msg_queue *msq;
+
+	switch (cmd) {
+	case IPC_INFO:
+	case MSG_INFO:
+	{
+		struct msginfo msginfo;
+		int max_id;
+
+		if (!buf)
+			return -EFAULT;
+
+		err = security_msg_queue_msgctl(NULL, cmd);
+		if (err)
+			return err;
+
+		/*
+		 * We must not return kernel stack data.
+		 * due to padding, it's not enough
+		 * to set all member fields.
+		 */
+		memset(&msginfo, 0, sizeof(msginfo));
+		msginfo.msgmni = ns->msg_ctlmni;
+		msginfo.msgmax = ns->msg_ctlmax;
+		msginfo.msgmnb = ns->msg_ctlmnb;
+		msginfo.msgssz = MSGSSZ;
+		msginfo.msgseg = MSGSEG;
+		down_read(&msg_ids(ns).rwsem);
+		if (cmd == MSG_INFO) {
+			/* MSG_INFO reports current usage in these fields */
+			msginfo.msgpool = msg_ids(ns).in_use;
+			msginfo.msgmap = atomic_read(&ns->msg_hdrs);
+			msginfo.msgtql = atomic_read(&ns->msg_bytes);
+		} else {
+			msginfo.msgmap = MSGMAP;
+			msginfo.msgpool = MSGPOOL;
+			msginfo.msgtql = MSGTQL;
+		}
+		max_id = ipc_get_maxid(&msg_ids(ns));
+		up_read(&msg_ids(ns).rwsem);
+		if (copy_to_user(buf, &msginfo, sizeof(struct msginfo)))
+			return -EFAULT;
+		return (max_id < 0) ? 0 : max_id;
+	}
+
+	case MSG_STAT:
+	case IPC_STAT:
+	{
+		struct msqid64_ds tbuf;
+		int success_return;
+
+		if (!buf)
+			return -EFAULT;
+
+		memset(&tbuf, 0, sizeof(tbuf));
+
+		/* the queue is read under RCU only, without its object lock */
+		rcu_read_lock();
+		if (cmd == MSG_STAT) {
+			msq = msq_obtain_object(ns, msqid);
+			if (IS_ERR(msq)) {
+				err = PTR_ERR(msq);
+				goto out_unlock;
+			}
+			success_return = msq->q_perm.id;
+		} else {
+			msq = msq_obtain_object_check(ns, msqid);
+			if (IS_ERR(msq)) {
+				err = PTR_ERR(msq);
+				goto out_unlock;
+			}
+			success_return = 0;
+		}
+
+		err = -EACCES;
+		if (ipcperms(ns, &msq->q_perm, S_IRUGO))
+			goto out_unlock;
+
+		err = security_msg_queue_msgctl(msq, cmd);
+		if (err)
+			goto out_unlock;
+
+		kernel_to_ipc64_perm(&msq->q_perm, &tbuf.msg_perm);
+		tbuf.msg_stime  = msq->q_stime;
+		tbuf.msg_rtime  = msq->q_rtime;
+		tbuf.msg_ctime  = msq->q_ctime;
+		tbuf.msg_cbytes = msq->q_cbytes;
+		tbuf.msg_qnum   = msq->q_qnum;
+		tbuf.msg_qbytes = msq->q_qbytes;
+		tbuf.msg_lspid  = msq->q_lspid;
+		tbuf.msg_lrpid  = msq->q_lrpid;
+		rcu_read_unlock();
+
+		if (copy_msqid_to_user(buf, &tbuf, version))
+			return -EFAULT;
+		return success_return;
+	}
+
+	default:
+		return -EINVAL;
+	}
+
+	/* not reached: every case above returns or jumps to out_unlock */
+	return err;
+out_unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+/*
+ * msgctl() - dispatch a message queue control command.
+ *
+ * Negative @msqid or @cmd values are rejected with -EINVAL.  @cmd is then
+ * passed by address to ipc_parse_version(), which returns the interface
+ * version and may rewrite @cmd.  IPC_INFO, MSG_INFO, MSG_STAT and IPC_STAT
+ * go to msgctl_nolock(), IPC_SET and IPC_RMID to msgctl_down(); anything
+ * else is -EINVAL.
+ */
+SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
+{
+	struct ipc_namespace *ns;
+	int version;
+
+	if (msqid < 0 || cmd < 0)
+		return -EINVAL;
+
+	version = ipc_parse_version(&cmd);
+	ns = current->nsproxy->ipc_ns;
+
+	if (cmd == IPC_SET || cmd == IPC_RMID)
+		return msgctl_down(ns, msqid, cmd, buf, version);
+
+	/* for MSG_STAT, msqid is an index rather than a msg queue id */
+	if (cmd == IPC_INFO || cmd == MSG_INFO ||
+	    cmd == MSG_STAT || cmd == IPC_STAT)
+		return msgctl_nolock(ns, msqid, cmd, version, buf);
+
+	return -EINVAL;
+}
+
+/*
+ * Check whether @msg satisfies a receive request of the given search @mode
+ * and @type.  Returns 1 on a match and 0 otherwise, including for an
+ * unknown @mode.
+ */
+static int testmsg(struct msg_msg *msg, long type, int mode)
+{
+	int match = 0;
+
+	if (mode == SEARCH_ANY || mode == SEARCH_NUMBER)
+		match = 1;
+	else if (mode == SEARCH_LESSEQUAL)
+		match = msg->m_type <= type;
+	else if (mode == SEARCH_EQUAL)
+		match = msg->m_type == type;
+	else if (mode == SEARCH_NOTEQUAL)
+		match = msg->m_type != type;
+
+	return match;
+}
+/*
+ * pipelined_send - try to hand @msg directly to a sleeping receiver
+ * @msq: queue whose q_receivers list is scanned
+ * @msg: message being sent
+ *
+ * Walks msq->q_receivers and considers every receiver whose request
+ * matches (testmsg()) and that passes security_msg_queue_msgrcv(). Such a
+ * receiver is unlinked from the list. If its r_maxsize is smaller than the
+ * message it is woken with ERR_PTR(-E2BIG) and the scan continues;
+ * otherwise the queue's q_lrpid/q_rtime are updated and the message is
+ * published through msr->r_msg.
+ *
+ * do_msgsnd() calls this with the queue's object lock held.
+ *
+ * Returns 1 if the message was handed over, 0 if the caller still has to
+ * enqueue it.
+ */
+static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
+{
+	struct msg_receiver *msr, *t;
+
+	list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
+		if (testmsg(msg, msr->r_msgtype, msr->r_mode) &&
+		    !security_msg_queue_msgrcv(msq, msg, msr->r_tsk,
+					       msr->r_msgtype, msr->r_mode)) {
+
+			list_del(&msr->r_list);
+			if (msr->r_maxsize < msg->m_ts) {
+				/*
+				 * initialize pipelined send ordering
+				 *
+				 * r_msg is NULL while the wake-up is in
+				 * flight; do_msgrcv() spins on NULL until the
+				 * final value is stored below.
+				 */
+				msr->r_msg = NULL;
+				wake_up_process(msr->r_tsk);
+				smp_mb(); /* see barrier comment below */
+				msr->r_msg = ERR_PTR(-E2BIG);
+			} else {
+				msr->r_msg = NULL;
+				msq->q_lrpid = task_pid_vnr(msr->r_tsk);
+				msq->q_rtime = get_seconds();
+				wake_up_process(msr->r_tsk);
+				/*
+				 * Ensure that the wakeup is visible before
+				 * setting r_msg, as the receiving end depends
+				 * on it. See lockless receive part 1 and 2 in
+				 * do_msgrcv().
+				 */
+				smp_mb();
+				msr->r_msg = msg;
+
+				return 1;
+			}
+		}
+	}
+
+	return 0;
+}
+/*
+ * do_msgsnd - send one message to a System V message queue
+ * @msqid:  queue identifier, must not be negative
+ * @mtype:  message type, must be >= 1
+ * @mtext:  user pointer to the payload
+ * @msgsz:  payload length in bytes, at most ns->msg_ctlmax
+ * @msgflg: IPC_NOWAIT makes a full queue fail with -EAGAIN instead of
+ *          sleeping; the flags are also passed to the LSM hook
+ *
+ * The payload is copied in with load_msg() before any lock is taken. The
+ * loop re-checks permissions, queue validity and the LSM hook on every
+ * pass, because the object lock is dropped around schedule(). Once there
+ * is room the message is either handed to a waiting receiver
+ * (pipelined_send()) or appended to q_messages.
+ *
+ * Returns 0 on success or a negative errno. On failure the message is
+ * freed here; on success msg is set to NULL so it is not freed.
+ */
+long do_msgsnd(int msqid, long mtype, void __user *mtext,
+		size_t msgsz, int msgflg)
+{
+	struct msg_queue *msq;
+	struct msg_msg *msg;
+	int err;
+	struct ipc_namespace *ns;
+
+	ns = current->nsproxy->ipc_ns;
+
+	if (msgsz > ns->msg_ctlmax || (long) msgsz < 0 || msqid < 0)
+		return -EINVAL;
+	if (mtype < 1)
+		return -EINVAL;
+
+	msg = load_msg(mtext, msgsz);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	msg->m_type = mtype;
+	msg->m_ts = msgsz;
+
+	rcu_read_lock();
+	msq = msq_obtain_object_check(ns, msqid);
+	if (IS_ERR(msq)) {
+		err = PTR_ERR(msq);
+		goto out_unlock1;
+	}
+
+	ipc_lock_object(&msq->q_perm);
+
+	for (;;) {
+		struct msg_sender s;
+
+		err = -EACCES;
+		if (ipcperms(ns, &msq->q_perm, S_IWUGO))
+			goto out_unlock0;
+
+		/* raced with RMID? */
+		if (!ipc_valid_object(&msq->q_perm)) {
+			err = -EIDRM;
+			goto out_unlock0;
+		}
+
+		err = security_msg_queue_msgsnd(msq, msg, msgflg);
+		if (err)
+			goto out_unlock0;
+
+		if (msgsz + msq->q_cbytes <= msq->q_qbytes &&
+				1 + msq->q_qnum <= msq->q_qbytes) {
+			break; /* both the byte and the message-count test passed */
+		}
+
+		/* queue full, wait: */
+		if (msgflg & IPC_NOWAIT) {
+			err = -EAGAIN;
+			goto out_unlock0;
+		}
+
+		/*
+		 * enqueue the sender and prepare to block
+		 *
+		 * NOTE(review): if ipc_rcu_getref() below fails we leave
+		 * through out_unlock0 without ss_del(), so the on-stack @s
+		 * would stay linked. Presumably the reference cannot fail for
+		 * an object that was just validated under the lock - confirm.
+		 */
+		ss_add(msq, &s);
+
+		if (!ipc_rcu_getref(msq)) {
+			err = -EIDRM;
+			goto out_unlock0;
+		}
+
+		ipc_unlock_object(&msq->q_perm);
+		rcu_read_unlock();
+		schedule();
+
+		rcu_read_lock();
+		ipc_lock_object(&msq->q_perm);
+
+		ipc_rcu_putref(msq, ipc_rcu_free);
+		/*
+		 * raced with RMID?
+		 * The reference taken before sleeping kept msq itself alive,
+		 * so this check is safe even if the queue was removed.
+		 */
+		if (!ipc_valid_object(&msq->q_perm)) {
+			err = -EIDRM;
+			goto out_unlock0;
+		}
+
+		ss_del(&s);
+
+		if (signal_pending(current)) {
+			err = -ERESTARTNOHAND;
+			goto out_unlock0;
+		}
+
+	}
+	msq->q_lspid = task_tgid_vnr(current);
+	msq->q_stime = get_seconds();
+
+	if (!pipelined_send(msq, msg)) {
+		/* no one is waiting for this message, enqueue it */
+		list_add_tail(&msg->m_list, &msq->q_messages);
+		msq->q_cbytes += msgsz;
+		msq->q_qnum++;
+		atomic_add(msgsz, &ns->msg_bytes);
+		atomic_inc(&ns->msg_hdrs);
+	}
+
+	err = 0;
+	msg = NULL; /* ownership moved on; skip free_msg() below */
+
+out_unlock0:
+	ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+	rcu_read_unlock();
+	if (msg != NULL)
+		free_msg(msg);
+	return err;
+}
+
+/*
+ * msgsnd(2) system call: fetch the message type from the user's msgbuf
+ * and let do_msgsnd() do the rest. A faulting read yields -EFAULT.
+ */
+SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
+		int, msgflg)
+{
+	long user_type;
+
+	if (get_user(user_type, &msgp->mtype) != 0)
+		return -EFAULT;
+
+	return do_msgsnd(msqid, user_type, msgp->mtext, msgsz, msgflg);
+}
+
+/*
+ * convert_mode - translate msgrcv() arguments into a SEARCH_* mode
+ * @msgtyp: requested type; negated in place when it is negative
+ * @msgflg: receive flags
+ *
+ *  MSG_COPY     => SEARCH_NUMBER (msgtyp is left untouched)
+ *  msgtyp == 0  => SEARCH_ANY, take the first message
+ *  msgtyp  > 0  => SEARCH_EQUAL, or SEARCH_NOTEQUAL with MSG_EXCEPT
+ *  msgtyp  < 0  => SEARCH_LESSEQUAL against abs(msgtyp), i.e. messages
+ *                  whose type is <= abs(msgtyp)
+ */
+static inline int convert_mode(long *msgtyp, int msgflg)
+{
+	long wanted;
+
+	if (msgflg & MSG_COPY)
+		return SEARCH_NUMBER;
+
+	wanted = *msgtyp;
+
+	if (wanted > 0)
+		return (msgflg & MSG_EXCEPT) ? SEARCH_NOTEQUAL : SEARCH_EQUAL;
+
+	if (wanted == 0)
+		return SEARCH_ANY;
+
+	*msgtyp = -wanted;
+	return SEARCH_LESSEQUAL;
+}
+
+/*
+ * do_msg_fill - copy a received message out to a user msgbuf
+ * @dest:  user pointer, interpreted as struct msgbuf
+ * @msg:   message to copy
+ * @bufsz: room available in the user's mtext area
+ *
+ * Writes the type, then at most @bufsz bytes of text. Returns the number
+ * of text bytes copied, or -EFAULT if either copy faults.
+ */
+static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz)
+{
+	struct msgbuf __user *ubuf = dest;
+	size_t ncopy = bufsz;
+
+	if (put_user(msg->m_type, &ubuf->mtype))
+		return -EFAULT;
+
+	/* Never copy more than the message actually holds. */
+	if (ncopy > msg->m_ts)
+		ncopy = msg->m_ts;
+
+	if (store_msg(ubuf->mtext, msg, ncopy))
+		return -EFAULT;
+
+	return ncopy;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * prepare_copy - build the dummy message a MSG_COPY receive copies into
+ *
+ * The dummy is produced by load_msg() from the user buffer, so it is large
+ * enough for bufsz bytes; its m_ts records that capacity. An ERR_PTR from
+ * load_msg() is passed through unchanged.
+ */
+static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz)
+{
+	struct msg_msg *dummy = load_msg(buf, bufsz);
+
+	if (IS_ERR(dummy))
+		return dummy;
+
+	dummy->m_ts = bufsz;
+	return dummy;
+}
+
+/* Release a dummy from prepare_copy(); a NULL pointer is ignored. */
+static inline void free_copy(struct msg_msg *copy)
+{
+	if (!copy)
+		return;
+
+	free_msg(copy);
+}
+#else
+/* Without CONFIG_CHECKPOINT_RESTORE, MSG_COPY is refused with -ENOSYS. */
+static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+/* Nothing was allocated, so there is nothing to release. */
+static inline void free_copy(struct msg_msg *copy)
+{
+}
+#endif
+
+/*
+ * find_msg - pick the message a receive request should get
+ * @msq:    queue to search
+ * @msgtyp: requested type; lowered in place while scanning in
+ *          SEARCH_LESSEQUAL mode
+ * @mode:   SEARCH_* mode from convert_mode()
+ *
+ * Only messages accepted by testmsg() and by the LSM hook are considered.
+ * SEARCH_NUMBER returns the accepted message whose running index equals
+ * *msgtyp. SEARCH_LESSEQUAL keeps scanning for a lower type unless a
+ * message of type 1 is met, which is returned at once. All other modes
+ * return the first accepted message.
+ *
+ * Returns the message or ERR_PTR(-EAGAIN) when nothing qualifies.
+ */
+static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode)
+{
+	struct msg_msg *cur;
+	struct msg_msg *best = NULL;
+	long index = 0;
+
+	list_for_each_entry(cur, &msq->q_messages, m_list) {
+		if (!testmsg(cur, *msgtyp, mode))
+			continue;
+		if (security_msg_queue_msgrcv(msq, cur, current,
+					      *msgtyp, mode))
+			continue;
+
+		if (mode == SEARCH_LESSEQUAL && cur->m_type != 1) {
+			/* Remember it, but keep looking for a smaller type. */
+			*msgtyp = cur->m_type - 1;
+			best = cur;
+		} else if (mode == SEARCH_NUMBER) {
+			if (*msgtyp == index)
+				return cur;
+		} else {
+			return cur;
+		}
+		index++;
+	}
+
+	if (best)
+		return best;
+
+	return ERR_PTR(-EAGAIN);
+}
+/*
+ * do_msgrcv - receive (or, with MSG_COPY, peek at) one message
+ * @msqid:       queue identifier, must not be negative
+ * @buf:         user buffer handed to @msg_handler
+ * @bufsz:       capacity of the user buffer's text area
+ * @msgtyp:      requested type; convert_mode() turns it into a SEARCH_* mode
+ * @msgflg:      IPC_NOWAIT, MSG_NOERROR, MSG_EXCEPT, MSG_COPY
+ * @msg_handler: copies the chosen message out and returns the byte count
+ *
+ * MSG_COPY requires IPC_NOWAIT and excludes MSG_EXCEPT; it copies the
+ * selected message into a preallocated dummy and leaves the queue
+ * untouched. Without a matching message the caller either fails with
+ * -ENOMSG (IPC_NOWAIT) or registers itself on q_receivers and sleeps.
+ *
+ * Returns whatever @msg_handler returns, or a negative errno.
+ */
+long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg,
+	       long (*msg_handler)(void __user *, struct msg_msg *, size_t))
+{
+	int mode;
+	struct msg_queue *msq;
+	struct ipc_namespace *ns;
+	struct msg_msg *msg, *copy = NULL;
+
+	ns = current->nsproxy->ipc_ns;
+
+	if (msqid < 0 || (long) bufsz < 0)
+		return -EINVAL;
+
+	if (msgflg & MSG_COPY) {
+		if ((msgflg & MSG_EXCEPT) || !(msgflg & IPC_NOWAIT))
+			return -EINVAL;
+		copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax));
+		if (IS_ERR(copy))
+			return PTR_ERR(copy);
+	}
+	mode = convert_mode(&msgtyp, msgflg);
+
+	rcu_read_lock();
+	msq = msq_obtain_object_check(ns, msqid);
+	if (IS_ERR(msq)) {
+		rcu_read_unlock();
+		free_copy(copy);
+		return PTR_ERR(msq);
+	}
+
+	for (;;) {
+		struct msg_receiver msr_d;
+
+		msg = ERR_PTR(-EACCES);
+		if (ipcperms(ns, &msq->q_perm, S_IRUGO))
+			goto out_unlock1;
+
+		ipc_lock_object(&msq->q_perm);
+
+		/* raced with RMID? */
+		if (!ipc_valid_object(&msq->q_perm)) {
+			msg = ERR_PTR(-EIDRM);
+			goto out_unlock0;
+		}
+
+		msg = find_msg(msq, &msgtyp, mode);
+		if (!IS_ERR(msg)) {
+			/*
+			 * Found a suitable message.
+			 * Unlink it from the queue, unless it does not fit
+			 * and truncation was not requested with MSG_NOERROR.
+			 */
+			if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) {
+				msg = ERR_PTR(-E2BIG);
+				goto out_unlock0;
+			}
+			/*
+			 * If we are copying, then do not unlink message and do
+			 * not update queue parameters.
+			 */
+			if (msgflg & MSG_COPY) {
+				msg = copy_msg(msg, copy);
+				goto out_unlock0;
+			}
+
+			list_del(&msg->m_list);
+			msq->q_qnum--;
+			msq->q_rtime = get_seconds();
+			msq->q_lrpid = task_tgid_vnr(current);
+			msq->q_cbytes -= msg->m_ts;
+			atomic_sub(msg->m_ts, &ns->msg_bytes);
+			atomic_dec(&ns->msg_hdrs);
+			ss_wakeup(&msq->q_senders, 0);
+
+			goto out_unlock0;
+		}
+
+		/* No message waiting. Wait for a message */
+		if (msgflg & IPC_NOWAIT) {
+			msg = ERR_PTR(-ENOMSG);
+			goto out_unlock0;
+		}
+
+		list_add_tail(&msr_d.r_list, &msq->q_receivers);
+		msr_d.r_tsk = current;
+		msr_d.r_msgtype = msgtyp;
+		msr_d.r_mode = mode;
+		if (msgflg & MSG_NOERROR)
+			msr_d.r_maxsize = INT_MAX;
+		else
+			msr_d.r_maxsize = bufsz;
+		msr_d.r_msg = ERR_PTR(-EAGAIN); /* "nothing delivered yet" marker */
+		__set_current_state(TASK_INTERRUPTIBLE);
+
+		ipc_unlock_object(&msq->q_perm);
+		rcu_read_unlock();
+		schedule();
+
+		/* Lockless receive, part 1:
+		 * Disable preemption.  We don't hold a reference to the queue
+		 * and getting a reference would defeat the idea of a lockless
+		 * operation, thus the code relies on rcu to guarantee the
+		 * existence of msq:
+		 * Prior to destruction, expunge_all(-EIDRM) changes r_msg.
+		 * Thus if r_msg is -EAGAIN, then the queue is not yet
+		 * destroyed.
+		 * rcu_read_lock() prevents preemption between reading r_msg
+		 * and acquiring the q_perm.lock in ipc_lock_object().
+		 */
+		rcu_read_lock();
+
+		/* Lockless receive, part 2:
+		 * Wait until pipelined_send or expunge_all are outside of
+		 * wake_up_process(). There is a race with exit(), see
+		 * ipc/mqueue.c for the details.
+		 */
+		msg = (struct msg_msg *)msr_d.r_msg;
+		while (msg == NULL) {
+			cpu_relax();
+			msg = (struct msg_msg *)msr_d.r_msg;
+		}
+
+		/* Lockless receive, part 3:
+		 * If there is a message or an error then accept it without
+		 * locking.
+		 */
+		if (msg != ERR_PTR(-EAGAIN))
+			goto out_unlock1;
+
+		/* Lockless receive, part 4:
+		 * Acquire the queue spinlock.
+		 */
+		ipc_lock_object(&msq->q_perm);
+
+		/* Lockless receive, part 5:
+		 * Repeat test after acquiring the spinlock.
+		 */
+		msg = (struct msg_msg *)msr_d.r_msg;
+		if (msg != ERR_PTR(-EAGAIN))
+			goto out_unlock0;
+
+		list_del(&msr_d.r_list);
+		if (signal_pending(current)) {
+			msg = ERR_PTR(-ERESTARTNOHAND);
+			goto out_unlock0;
+		}
+
+		ipc_unlock_object(&msq->q_perm);
+	}
+
+out_unlock0:
+	ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+	rcu_read_unlock();
+	if (IS_ERR(msg)) {
+		free_copy(copy);
+		return PTR_ERR(msg);
+	}
+
+	bufsz = msg_handler(buf, msg, bufsz);
+	free_msg(msg);
+
+	return bufsz;
+}
+
+/*
+ * msgrcv(2) system call: a thin wrapper that runs do_msgrcv() with
+ * do_msg_fill() as the routine that copies the message to user space.
+ */
+SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
+		long, msgtyp, int, msgflg)
+{
+	long received;
+
+	received = do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill);
+	return received;
+}
+
+
+/*
+ * msg_init_ns - set up the message-queue state of an ipc namespace
+ * @ns: namespace to initialise
+ *
+ * Zeroes the usage counters, applies the MSGMAX/MSGMNB/MSGMNI defaults
+ * and initialises the id table for IPC_MSG_IDS.
+ */
+void msg_init_ns(struct ipc_namespace *ns)
+{
+	atomic_set(&ns->msg_hdrs, 0);
+	atomic_set(&ns->msg_bytes, 0);
+
+	ns->msg_ctlmni = MSGMNI;
+	ns->msg_ctlmnb = MSGMNB;
+	ns->msg_ctlmax = MSGMAX;
+
+	ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+}
+
+#ifdef CONFIG_IPC_NS
+/*
+ * msg_exit_ns - release every message queue of a dying ipc namespace
+ * @ns: namespace being torn down
+ *
+ * freeque() is applied to each remaining queue through free_ipcs(), then
+ * the idr backing the id table is destroyed.
+ */
+void msg_exit_ns(struct ipc_namespace *ns)
+{
+	struct ipc_ids *table = &ns->ids[IPC_MSG_IDS];
+
+	free_ipcs(ns, &msg_ids(ns), freeque);
+	idr_destroy(&table->ipcs_idr);
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+/*
+ * sysvipc_msg_proc_show - emit one line of /proc/sysvipc/msg
+ * @s:  seq_file being filled
+ * @it: the message queue to describe
+ *
+ * Owner and creator ids are translated into the user namespace of the
+ * reader (seq_user_ns()). Always returns 0.
+ */
+static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
+{
+	struct msg_queue *queue = it;
+	struct kern_ipc_perm *perm = &queue->q_perm;
+	struct user_namespace *reader_ns = seq_user_ns(s);
+
+	seq_printf(s,
+		   "%10d %10d  %4o  %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
+		   perm->key,
+		   perm->id,
+		   perm->mode,
+		   queue->q_cbytes,
+		   queue->q_qnum,
+		   queue->q_lspid,
+		   queue->q_lrpid,
+		   from_kuid_munged(reader_ns, perm->uid),
+		   from_kgid_munged(reader_ns, perm->gid),
+		   from_kuid_munged(reader_ns, perm->cuid),
+		   from_kgid_munged(reader_ns, perm->cgid),
+		   queue->q_stime,
+		   queue->q_rtime,
+		   queue->q_ctime);
+
+	return 0;
+}
+#endif
+
+/*
+ * msg_init - boot-time setup of System V message queues
+ *
+ * Initialises the message state of the initial ipc namespace and
+ * registers the "sysvipc/msg" proc interface with its column header.
+ */
+void __init msg_init(void)
+{
+	struct ipc_namespace *boot_ns = &init_ipc_ns;
+
+	msg_init_ns(boot_ns);
+
+	ipc_init_proc_interface("sysvipc/msg",
+				"       key      msqid perms      cbytes       qnum lspid lrpid   uid   gid  cuid  cgid      stime      rtime      ctime\n",
+				IPC_MSG_IDS, sysvipc_msg_proc_show);
+}
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
new file mode 100644
index 000000000..2b491590e
--- /dev/null
+++ b/ipc/msgutil.c
@@ -0,0 +1,185 @@
+/*
+ * linux/ipc/msgutil.c
+ * Copyright (C) 1999, 2004 Manfred Spraul
+ *
+ * This file is released under GNU General Public Licence version 2 or
+ * (at your option) any later version.
+ *
+ * See the file COPYING for more details.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/ipc.h>
+#include <linux/msg.h>
+#include <linux/ipc_namespace.h>
+#include <linux/utsname.h>
+#include <linux/proc_ns.h>
+#include <linux/uaccess.h>
+
+#include "util.h"
+
+DEFINE_SPINLOCK(mq_lock);
+
+/*
+ * The next two definitions live here because this is the only file
+ * that is compiled when either CONFIG_SYSVIPC or CONFIG_POSIX_MQUEUE
+ * is enabled, even if CONFIG_IPC_NS is not.
+ */
+struct ipc_namespace init_ipc_ns = {
+	.count		= ATOMIC_INIT(1),
+	.user_ns = &init_user_ns,
+	.ns.inum = PROC_IPC_INIT_INO,
+#ifdef CONFIG_IPC_NS
+	.ns.ops = &ipcns_operations,
+#endif
+};
+
+atomic_t nr_ipc_ns = ATOMIC_INIT(1);
+/*
+ * Continuation segment of a message that does not fit into the first
+ * allocation. Segments form a singly linked list starting at msg->next.
+ */
+struct msg_msgseg {
+	struct msg_msgseg *next;
+	/* the next part of the message follows immediately */
+};
+/*
+ * Payload capacity of the first allocation (header is a struct msg_msg)
+ * and of each continuation segment (header is a struct msg_msgseg), chosen
+ * so that header plus payload add up to PAGE_SIZE.
+ */
+#define DATALEN_MSG	((size_t)PAGE_SIZE-sizeof(struct msg_msg))
+#define DATALEN_SEG	((size_t)PAGE_SIZE-sizeof(struct msg_msgseg))
+
+
+/*
+ * alloc_msg - allocate an empty message able to hold @len payload bytes
+ * @len: payload size in bytes
+ *
+ * The first DATALEN_MSG bytes live right behind the struct msg_msg; the
+ * remainder is spread over a chain of msg_msgseg allocations of up to
+ * DATALEN_SEG bytes each. Returns NULL if any allocation fails, after
+ * freeing whatever had been allocated so far.
+ */
+static struct msg_msg *alloc_msg(size_t len)
+{
+	struct msg_msg *head;
+	struct msg_msgseg **link;
+	size_t chunk = min(len, DATALEN_MSG);
+
+	head = kmalloc(sizeof(*head) + chunk, GFP_KERNEL);
+	if (!head)
+		return NULL;
+
+	head->next = NULL;
+	head->security = NULL;
+
+	link = &head->next;
+	for (len -= chunk; len > 0; len -= chunk) {
+		struct msg_msgseg *seg;
+
+		chunk = min(len, DATALEN_SEG);
+		seg = kmalloc(sizeof(*seg) + chunk, GFP_KERNEL);
+		if (!seg) {
+			/* The chain built so far is NULL-terminated. */
+			free_msg(head);
+			return NULL;
+		}
+
+		seg->next = NULL;
+		*link = seg;
+		link = &seg->next;
+	}
+
+	return head;
+}
+
+/*
+ * load_msg - build a kernel message from a user buffer
+ * @src: user pointer to the payload
+ * @len: payload size in bytes
+ *
+ * Allocates the message with alloc_msg(), copies the payload in segment
+ * by segment and finally lets the LSM attach its state. Returns the
+ * message, ERR_PTR(-ENOMEM) if allocation fails, ERR_PTR(-EFAULT) if a
+ * copy faults, or the error from security_msg_msg_alloc().
+ */
+struct msg_msg *load_msg(const void __user *src, size_t len)
+{
+	struct msg_msg *msg;
+	struct msg_msgseg *seg;
+	size_t chunk;
+	int err;
+
+	msg = alloc_msg(len);
+	if (!msg)
+		return ERR_PTR(-ENOMEM);
+
+	err = -EFAULT;
+	chunk = min(len, DATALEN_MSG);
+	if (copy_from_user(msg + 1, src, chunk))
+		goto fail;
+
+	seg = msg->next;
+	while (seg) {
+		len -= chunk;
+		src = (char __user *)src + chunk;
+		chunk = min(len, DATALEN_SEG);
+		if (copy_from_user(seg + 1, src, chunk))
+			goto fail;
+		seg = seg->next;
+	}
+
+	err = security_msg_msg_alloc(msg);
+	if (!err)
+		return msg;
+
+fail:
+	free_msg(msg);
+	return ERR_PTR(err);
+}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * copy_msg - duplicate the contents of @src into the preallocated @dst
+ * @src: message to copy from
+ * @dst: destination; its m_ts holds its capacity on entry
+ *
+ * Fails with ERR_PTR(-EINVAL) if @src is larger than @dst. Otherwise the
+ * payload, m_type and m_ts are copied and @dst is returned. @dst must not
+ * be NULL.
+ */
+struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
+{
+	struct msg_msgseg *to, *from;
+	size_t remaining = src->m_ts;
+	size_t chunk;
+
+	BUG_ON(dst == NULL);
+	if (dst->m_ts < src->m_ts)
+		return ERR_PTR(-EINVAL);
+
+	chunk = min(remaining, DATALEN_MSG);
+	memcpy(dst + 1, src + 1, chunk);
+
+	to = dst->next;
+	from = src->next;
+	while (from != NULL) {
+		remaining -= chunk;
+		chunk = min(remaining, DATALEN_SEG);
+		memcpy(to + 1, from + 1, chunk);
+
+		to = to->next;
+		from = from->next;
+	}
+
+	dst->m_ts = src->m_ts;
+	dst->m_type = src->m_type;
+
+	return dst;
+}
+#else
+/* Without CONFIG_CHECKPOINT_RESTORE copying is not available. */
+struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
+{
+	return ERR_PTR(-ENOSYS);
+}
+#endif
+/*
+ * store_msg - copy the first @len payload bytes of @msg to user space
+ * @dest: user destination buffer
+ * @msg:  message to read from
+ * @len:  number of bytes to copy
+ *
+ * Returns 0 on success and -1 if any copy_to_user() fails.
+ */
+int store_msg(void __user *dest, struct msg_msg *msg, size_t len)
+{
+	struct msg_msgseg *seg;
+	size_t chunk = min(len, DATALEN_MSG);
+
+	if (copy_to_user(dest, msg + 1, chunk))
+		return -1;
+
+	seg = msg->next;
+	while (seg != NULL) {
+		len -= chunk;
+		dest = (char __user *)dest + chunk;
+		chunk = min(len, DATALEN_SEG);
+		if (copy_to_user(dest, seg + 1, chunk))
+			return -1;
+		seg = seg->next;
+	}
+
+	return 0;
+}
+
+/*
+ * free_msg - release a message and all of its continuation segments
+ * @msg: message to free; must not be NULL
+ *
+ * The LSM state is released first. The segment chain head is read before
+ * the message itself is freed.
+ */
+void free_msg(struct msg_msg *msg)
+{
+	struct msg_msgseg *seg, *following;
+
+	security_msg_msg_free(msg);
+
+	seg = msg->next;
+	kfree(msg);
+
+	for (; seg != NULL; seg = following) {
+		following = seg->next;
+		kfree(seg);
+	}
+}
diff --git a/ipc/namespace.c b/ipc/namespace.c
new file mode 100644
index 000000000..068caf18d
--- /dev/null
+++ b/ipc/namespace.c
@@ -0,0 +1,175 @@
+/*
+ * linux/ipc/namespace.c
+ * Copyright (C) 2006 Pavel Emelyanov <xemul@openvz.org> OpenVZ, SWsoft Inc.
+ */
+
+#include <linux/ipc.h>
+#include <linux/msg.h>
+#include <linux/ipc_namespace.h>
+#include <linux/rcupdate.h>
+#include <linux/nsproxy.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
+
+#include "util.h"
+
+/*
+ * create_ipc_ns - allocate and initialise a fresh ipc namespace
+ * @user_ns: user namespace that will own the new ipc namespace
+ * @old_ns:  namespace of the caller (currently unused)
+ *
+ * The owning user namespace is recorded before any subsystem init hook
+ * runs, so none of them can observe an uninitialised ns->user_ns (the
+ * structure comes from kmalloc() and is not zeroed). The reference taken
+ * on @user_ns is dropped again if mq_init_ns() fails.
+ *
+ * Returns the new namespace with a reference count of 1, or an ERR_PTR().
+ */
+static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
+					   struct ipc_namespace *old_ns)
+{
+	struct ipc_namespace *ns;
+	int err;
+
+	ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
+	if (ns == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	err = ns_alloc_inum(&ns->ns);
+	if (err)
+		goto fail_free;
+	ns->ns.ops = &ipcns_operations;
+
+	atomic_set(&ns->count, 1);
+	ns->user_ns = get_user_ns(user_ns);
+
+	err = mq_init_ns(ns);
+	if (err)
+		goto fail_put;
+	atomic_inc(&nr_ipc_ns);
+
+	sem_init_ns(ns);
+	msg_init_ns(ns);
+	shm_init_ns(ns);
+
+	return ns;
+
+fail_put:
+	put_user_ns(ns->user_ns);
+	ns_free_inum(&ns->ns);
+fail_free:
+	kfree(ns);
+	return ERR_PTR(err);
+}
+
+/*
+ * copy_ipcs - choose the ipc namespace for a new task or unshare()
+ * @flags:   clone flags
+ * @user_ns: user namespace to own a newly created ipc namespace
+ * @ns:      current ipc namespace
+ *
+ * With CLONE_NEWIPC a fresh namespace is created; otherwise @ns is
+ * returned with an extra reference.
+ */
+struct ipc_namespace *copy_ipcs(unsigned long flags,
+	struct user_namespace *user_ns, struct ipc_namespace *ns)
+{
+	if (flags & CLONE_NEWIPC)
+		return create_ipc_ns(user_ns, ns);
+
+	return get_ipc_ns(ns);
+}
+
+/*
+ * free_ipcs - free all ipcs of one type
+ * @ns:   the namespace to remove the ipcs from
+ * @ids:  the table of ipcs to free
+ * @free: the function called to free each individual ipc
+ *
+ * Called for each kind of ipc when an ipc_namespace exits. The table is
+ * walked by id under ids->rwsem until as many objects as were in use on
+ * entry have been passed to @free. Each object is handed over with the
+ * RCU read lock and the object lock taken here; this function does not
+ * release them itself.
+ */
+void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
+	       void (*free)(struct ipc_namespace *, struct kern_ipc_perm *))
+{
+	struct kern_ipc_perm *perm;
+	int id = 0;
+	int remaining;
+
+	down_write(&ids->rwsem);
+
+	remaining = ids->in_use;
+
+	while (remaining > 0) {
+		perm = idr_find(&ids->ipcs_idr, id++);
+		if (!perm)
+			continue;
+
+		rcu_read_lock();
+		ipc_lock_object(perm);
+		free(ns, perm);
+		remaining--;
+	}
+
+	up_write(&ids->rwsem);
+}
+/*
+ * free_ipc_ns - tear down an ipc namespace and release its memory
+ * @ns: namespace to destroy
+ *
+ * Runs the sem, msg and shm exit hooks, drops the global namespace
+ * count, releases the reference on the owning user namespace and the
+ * namespace inode number, then frees @ns. Called from put_ipc_ns().
+ */
+static void free_ipc_ns(struct ipc_namespace *ns)
+{
+	sem_exit_ns(ns);
+	msg_exit_ns(ns);
+	shm_exit_ns(ns);
+	atomic_dec(&nr_ipc_ns);
+
+	put_user_ns(ns->user_ns);
+	ns_free_inum(&ns->ns);
+	kfree(ns);
+}
+
+/*
+ * put_ipc_ns - drop a reference to an ipc namespace.
+ * @ns: the namespace to put
+ *
+ * When the last task of the namespace exits and drops the refcount to 0,
+ * it can race with a task from another ipc namespace that has this
+ * ipcns's mqueuefs mounted and is operating on one of its files, which
+ * can raise the refcount again. For that reason both dropping the
+ * refcount here and raising it on access through the VFS are serialised
+ * by mq_lock.
+ *
+ * (A task raising the refcount on its own ipc_ns needn't take mq_lock,
+ * since it cannot race with the last task in the ipcns exiting.)
+ */
+void put_ipc_ns(struct ipc_namespace *ns)
+{
+	/* Only the final put returns true, with mq_lock held. */
+	if (!atomic_dec_and_lock(&ns->count, &mq_lock))
+		return;
+
+	mq_clear_sbinfo(ns);
+	spin_unlock(&mq_lock);
+
+	mq_put_mnt(ns);
+	free_ipc_ns(ns);
+}
+
+/* Map an embedded ns_common back to the ipc_namespace that contains it. */
+static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
+{
+	struct ipc_namespace *ipcns;
+
+	ipcns = container_of(ns, struct ipc_namespace, ns);
+	return ipcns;
+}
+
+/*
+ * ipcns_get - take a reference on the ipc namespace of @task
+ *
+ * task->nsproxy is sampled under task_lock(). Returns the namespace's
+ * ns_common, or NULL if the task has no nsproxy.
+ */
+static struct ns_common *ipcns_get(struct task_struct *task)
+{
+	struct ipc_namespace *ipcns = NULL;
+	struct nsproxy *proxy;
+
+	task_lock(task);
+	proxy = task->nsproxy;
+	if (proxy != NULL)
+		ipcns = get_ipc_ns(proxy->ipc_ns);
+	task_unlock(task);
+
+	if (!ipcns)
+		return NULL;
+
+	return &ipcns->ns;
+}
+
+/*
+ * ipcns_put - proc_ns_operations hook that drops a namespace reference
+ * @ns: ns_common embedded in the ipc namespace
+ *
+ * put_ipc_ns() returns void, so it is called as a plain statement;
+ * "return <void expression>;" in a void function is not valid ISO C.
+ */
+static void ipcns_put(struct ns_common *ns)
+{
+	put_ipc_ns(to_ipc_ns(ns));
+}
+
+/*
+ * ipcns_install - switch @nsproxy over to the ipc namespace @new
+ *
+ * The caller needs CAP_SYS_ADMIN both in the user namespace owning the
+ * target ipc namespace and in its current user namespace, else -EPERM.
+ * On success the semaphore undo state of the current task is discarded,
+ * the old namespace reference is dropped and a reference on the new one
+ * is stored. Returns 0.
+ */
+static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
+{
+	struct ipc_namespace *target = to_ipc_ns(new);
+
+	if (!ns_capable(target->user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Ditch state from the old ipc namespace */
+	exit_sem(current);
+	put_ipc_ns(nsproxy->ipc_ns);
+	nsproxy->ipc_ns = get_ipc_ns(target);
+
+	return 0;
+}
+/*
+ * proc_ns_operations table for the ipc namespace: names it "ipc", ties
+ * it to CLONE_NEWIPC and wires up the get/put/install hooks above.
+ */
+const struct proc_ns_operations ipcns_operations = {
+	.name		= "ipc",
+	.type		= CLONE_NEWIPC,
+	.get		= ipcns_get,
+	.put		= ipcns_put,
+	.install	= ipcns_install,
+};
diff --git a/ipc/sem.c b/ipc/sem.c
new file mode 100644
index 000000000..d1a6edd17
--- /dev/null
+++ b/ipc/sem.c
@@ -0,0 +1,2188 @@
+/*
+ * linux/ipc/sem.c
+ * Copyright (C) 1992 Krishna Balasubramanian
+ * Copyright (C) 1995 Eric Schenk, Bruno Haible
+ *
+ * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
+ *
+ * SMP-threaded, sysctl's added
+ * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
+ * Enforced range limit on SEM_UNDO
+ * (c) 2001 Red Hat Inc
+ * Lockless wakeup
+ * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
+ * Further wakeup optimizations, documentation
+ * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
+ *
+ * support for audit of ipc object properties and permission changes
+ * Dustin Kirkland <dustin.kirkland@us.ibm.com>
+ *
+ * namespaces support
+ * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ *
+ * Implementation notes: (May 2010)
+ * This file implements System V semaphores.
+ *
+ * User space visible behavior:
+ * - FIFO ordering for semop() operations (just FIFO, not starvation
+ *   protection)
+ * - multiple semaphore operations that alter the same semaphore in
+ *   one semop() are handled.
+ * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
+ *   SETALL calls.
+ * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
+ * - undo adjustments at process exit are limited to 0..SEMVMX.
+ * - namespaces are supported.
+ * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
+ *   to /proc/sys/kernel/sem.
+ * - statistics about the usage are reported in /proc/sysvipc/sem.
+ *
+ * Internals:
+ * - scalability:
+ *   - all global variables are read-mostly.
+ *   - semop() calls and semctl(RMID) are synchronized by RCU.
+ *   - most operations do write operations (actually: spin_lock calls) to
+ *     the per-semaphore array structure.
+ *   Thus: Perfect SMP scaling between independent semaphore arrays.
+ *         If multiple semaphores in one array are used, then cache line
+ *         thrashing on the semaphore array spinlock will limit the scaling.
+ * - semncnt and semzcnt are calculated on demand in count_semcnt()
+ * - the task that performs a successful semop() scans the list of all
+ *   sleeping tasks and completes any pending operations that can be fulfilled.
+ *   Semaphores are actively given to waiting tasks (necessary for FIFO).
+ *   (see update_queue())
+ * - To improve the scalability, the actual wake-up calls are performed after
+ *   dropping all locks. (see wake_up_sem_queue_prepare(),
+ *   wake_up_sem_queue_do())
+ * - All work is done by the waker, the woken up task does not have to do
+ *   anything - not even acquiring a lock or dropping a refcount.
+ * - A woken up task may not even touch the semaphore array anymore, it may
+ *   have been destroyed already by a semctl(RMID).
+ * - The synchronizations between wake-ups due to a timeout/signal and a
+ *   wake-up due to a completed semaphore operation is achieved by using an
+ *   intermediate state (IN_WAKEUP).
+ * - UNDO values are stored in an array (one per process and per
+ *   semaphore array, lazily allocated). For backwards compatibility, multiple
+ *   modes for the UNDO variables are supported (per process, per thread)
+ *   (see copy_semundo, CLONE_SYSVSEM)
+ * - There are two lists of the pending operations: a per-array list
+ *   and per-semaphore list (stored in the array). This allows to achieve FIFO
+ *   ordering without always scanning all pending operations.
+ *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/time.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/seq_file.h>
+#include <linux/rwsem.h>
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+
+#include <linux/uaccess.h>
+#include "util.h"
+
+/* One semaphore structure for each semaphore in the system. */
+struct sem {
+	int	semval;		/* current value */
+	int	sempid;		/* pid of last operation */
+	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
+	struct list_head pending_alter; /* pending single-sop operations */
+					/* that alter the semaphore */
+	struct list_head pending_const; /* pending single-sop operations */
+					/* that do not alter the semaphore*/
+	time_t	sem_otime;	/* candidate for sem_otime */
+} ____cacheline_aligned_in_smp;
+
+/* One queue for each sleeping process in the system. */
+struct sem_queue {
+	struct list_head	list;	 /* queue of pending operations */
+	struct task_struct	*sleeper; /* this process */
+	struct sem_undo		*undo;	 /* undo structure */
+	int			pid;	 /* process id of requesting process */
+	int			status;	 /* completion status of operation */
+	struct sembuf		*sops;	 /* array of pending operations */
+	struct sembuf		*blocking; /* the operation that blocked */
+	int			nsops;	 /* number of operations */
+	int			alter;	 /* does *sops alter the array? */
+};
+
+/* Each task has a list of undo requests. They are executed automatically
+ * when the process exits.
+ */
+struct sem_undo {
+	struct list_head	list_proc;	/* per-process list: *
+						 * all undos from one process
+						 * rcu protected */
+	struct rcu_head		rcu;		/* rcu struct for sem_undo */
+	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
+	struct list_head	list_id;	/* per semaphore array list:
+						 * all undos for one array */
+	int			semid;		/* semaphore set identifier */
+	short			*semadj;	/* array of adjustments */
+						/* one per semaphore */
+};
+
+/* sem_undo_list controls shared access to the list of sem_undo structures
+ * that may be shared among all tasks of a CLONE_SYSVSEM task group.
+ */
+struct sem_undo_list {
+	atomic_t		refcnt;		/* reference count */
+	spinlock_t		lock;		/* NOTE(review): presumably guards list_proc - confirm */
+	struct list_head	list_proc;	/* head for sem_undo.list_proc entries */
+};
+
+
+#define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])
+
+#define sem_checkid(sma, semid)	ipc_checkid(&sma->sem_perm, semid)
+
+static int newary(struct ipc_namespace *, struct ipc_params *);
+static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
+#ifdef CONFIG_PROC_FS
+static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
+#endif
+
+#define SEMMSL_FAST	256 /* 512 bytes on stack */
+#define SEMOPM_FAST	64  /* ~ 372 bytes on stack */
+
+/*
+ * Locking:
+ *	sem_undo.id_next,
+ *	sem_array.complex_count,
+ *	sem_array.pending{_alter,_const},
+ *	sem_array.sem_undo: global sem_lock() for read/write
+ *	sem_undo.proc_next: only "current" is allowed to read/write that field.
+ *
+ *	sem_array.sem_base[i].pending_{const,alter}:
+ *		global or semaphore sem_lock() for read/write
+ */
+
+#define sc_semmsl	sem_ctls[0]
+#define sc_semmns	sem_ctls[1]
+#define sc_semopm	sem_ctls[2]
+#define sc_semmni	sem_ctls[3]
+
+/*
+ * sem_init_ns - set up the semaphore state of an ipc namespace
+ * @ns: namespace to initialise
+ *
+ * Clears the used-semaphore counter, loads the four sem_ctls[] limits
+ * with their SEMMSL/SEMMNS/SEMOPM/SEMMNI defaults and initialises the id
+ * table for IPC_SEM_IDS.
+ */
+void sem_init_ns(struct ipc_namespace *ns)
+{
+	ns->used_sems = 0;
+
+	ns->sc_semmni = SEMMNI;
+	ns->sc_semopm = SEMOPM;
+	ns->sc_semmns = SEMMNS;
+	ns->sc_semmsl = SEMMSL;
+
+	ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
+}
+
+#ifdef CONFIG_IPC_NS
+/*
+ * sem_exit_ns - release every semaphore array of a dying ipc namespace
+ * @ns: namespace being torn down
+ *
+ * freeary() is applied to each remaining array through free_ipcs(), then
+ * the idr backing the id table is destroyed.
+ */
+void sem_exit_ns(struct ipc_namespace *ns)
+{
+	struct ipc_ids *table = &sem_ids(ns);
+
+	free_ipcs(ns, table, freeary);
+	idr_destroy(&table->ipcs_idr);
+}
+#endif
+
+/*
+ * sem_init - boot-time setup of System V semaphores
+ *
+ * Initialises the semaphore state of the initial ipc namespace and
+ * registers the "sysvipc/sem" proc interface with its column header.
+ */
+void __init sem_init(void)
+{
+	struct ipc_namespace *boot_ns = &init_ipc_ns;
+
+	sem_init_ns(boot_ns);
+
+	ipc_init_proc_interface("sysvipc/sem",
+				"       key      semid perms      nsems   uid   gid  cuid  cgid      otime      ctime\n",
+				IPC_SEM_IDS, sysvipc_sem_proc_show);
+}
+
+/**
+ * unmerge_queues - unmerge queues, if possible.
+ * @sma: semaphore array
+ *
+ * The function unmerges the wait queues if complex_count is 0.
+ * It must be called prior to dropping the global semaphore array lock.
+ *
+ * Every entry on sma->pending_alter is appended to the pending_alter list
+ * of the semaphore named by its first operation (sops[0].sem_num).
+ * Entries are relinked without list_del(); the array-wide list head is
+ * simply re-initialised once the walk is complete.
+ */
+static void unmerge_queues(struct sem_array *sma)
+{
+	struct sem_queue *q, *tq;
+
+	/* complex operations still around? */
+	if (sma->complex_count)
+		return;
+	/*
+	 * We will switch back to simple mode.
+	 * Move all pending operation back into the per-semaphore
+	 * queues.
+	 */
+	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
+		struct sem *curr;
+		curr = &sma->sem_base[q->sops[0].sem_num];
+
+		list_add_tail(&q->list, &curr->pending_alter);
+	}
+	INIT_LIST_HEAD(&sma->pending_alter);
+}
+
+/**
+ * merge_queues - merge single semop queues into global queue
+ * @sma: semaphore array
+ *
+ * Splices the pending_alter list of every semaphore into the array-wide
+ * pending_alter list, leaving the per-semaphore lists empty. This is what
+ * gives FIFO ordering to pending single-sop operations once a multi-semop
+ * operation has to sleep. Only alter operations are moved; const
+ * operations stay on their per-semaphore lists.
+ */
+static void merge_queues(struct sem_array *sma)
+{
+	int idx = 0;
+
+	while (idx < sma->sem_nsems) {
+		list_splice_init(&sma->sem_base[idx].pending_alter,
+				 &sma->pending_alter);
+		idx++;
+	}
+}
+
+/*
+ * sem_rcu_free - RCU callback that releases a semaphore array
+ * @head: rcu_head embedded in the array's struct ipc_rcu
+ *
+ * Frees the LSM state of the array, then passes @head on to
+ * ipc_rcu_free().
+ */
+static void sem_rcu_free(struct rcu_head *head)
+{
+	struct ipc_rcu *hdr = container_of(head, struct ipc_rcu, rcu);
+	struct sem_array *array = ipc_rcu_to_struct(hdr);
+
+	security_sem_free(array);
+
+	ipc_rcu_free(head);
+}
+
+/*
+ * Wait until all currently ongoing simple ops have completed.
+ * Caller must own sem_perm.lock.
+ * New simple ops cannot start, because simple ops first check
+ * that a) sem_perm.lock is free and b) complex_count is 0.
+ */
+static void sem_wait_array(struct sem_array *sma)
+{
+	int idx;
+
+	/*
+	 * The thread that increased sma->complex_count waited on
+	 * all sem->lock locks. Thus we don't need to wait again.
+	 */
+	if (sma->complex_count)
+		return;
+
+	for (idx = 0; idx < sma->sem_nsems; idx++)
+		spin_unlock_wait(&sma->sem_base[idx].lock);
+}
+
+/*
+ * If the request contains only one semaphore operation, and there are
+ * no complex transactions pending, lock only the semaphore involved.
+ * Otherwise, lock the entire semaphore array, since we either have
+ * multiple semaphores in our own semops, or we need to look at
+ * semaphores from other pending complex operations.
+ *
+ * Returns the index of the semaphore whose sem->lock was taken, or -1
+ * if the array-wide sem_perm lock was taken instead. The value has to be
+ * handed back to sem_unlock().
+ */
+static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
+			      int nsops)
+{
+	struct sem *sem;
+
+	if (nsops != 1) {
+		/* Complex operation - acquire a full lock */
+		ipc_lock_object(&sma->sem_perm);
+
+		/* And wait until all simple ops that are processed
+		 * right now have dropped their locks.
+		 */
+		sem_wait_array(sma);
+		return -1;
+	}
+
+	/*
+	 * Only one semaphore affected - try to optimize locking.
+	 * The rules are:
+	 * - optimized locking is possible if no complex operation
+	 *   is either enqueued or processed right now.
+	 * - The test for enqueued complex ops is simple:
+	 *      sma->complex_count != 0
+	 * - Testing for complex ops that are processed right now is
+	 *   a bit more difficult. Complex ops acquire the full lock
+	 *   and first wait that the running simple ops have completed.
+	 *   (see above)
+	 *   Thus: If we own a simple lock and the global lock is free
+	 *	and complex_count is now 0, then it will stay 0 and
+	 *	thus just locking sem->lock is sufficient.
+	 */
+	sem = sma->sem_base + sops->sem_num;
+
+	if (sma->complex_count == 0) {
+		/*
+		 * It appears that no complex operation is around.
+		 * Acquire the per-semaphore lock.
+		 */
+		spin_lock(&sem->lock);
+
+		/* Then check that the global lock is free */
+		if (!spin_is_locked(&sma->sem_perm.lock)) {
+			/*
+			 * The ipc object lock check must be visible on all
+			 * cores before rechecking the complex count.  Otherwise
+			 * we can race with  another thread that does:
+			 *	complex_count++;
+			 *	spin_unlock(sem_perm.lock);
+			 */
+			smp_rmb();
+
+			/*
+			 * Now repeat the test of complex_count:
+			 * It can't change anymore until we drop sem->lock.
+			 * Thus: if is now 0, then it will stay 0.
+			 */
+			if (sma->complex_count == 0) {
+				/* fast path successful! */
+				return sops->sem_num;
+			}
+		}
+		spin_unlock(&sem->lock);
+	}
+
+	/* slow path: acquire the full lock */
+	ipc_lock_object(&sma->sem_perm);
+
+	if (sma->complex_count == 0) {
+		/* False alarm:
+		 * There is no complex operation, thus we can switch
+		 * back to the fast path.
+		 */
+		spin_lock(&sem->lock);
+		ipc_unlock_object(&sma->sem_perm);
+		return sops->sem_num;
+	} else {
+		/* Not a false alarm, thus complete the sequence for a
+		 * full lock.
+		 */
+		sem_wait_array(sma);
+		return -1;
+	}
+}
+
+static inline void sem_unlock(struct sem_array *sma, int locknum)
+{
+	/* A locknum other than -1 is the index of the one semaphore locked. */
+	if (locknum != -1) {
+		spin_unlock(&sma->sem_base[locknum].lock);
+		return;
+	}
+
+	/* locknum == -1: the whole array was locked through sem_perm. */
+	unmerge_queues(sma);
+	ipc_unlock_object(&sma->sem_perm);
+}
+
+/*
+ * sem_lock_(check_) routines are called in the paths where the rwsem
+ * is not held.
+ *
+ * The caller holds the RCU read lock.
+ */
+static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
+			int id, struct sembuf *sops, int nsops, int *locknum)
+{
+	struct kern_ipc_perm *perm = ipc_obtain_object(&sem_ids(ns), id);
+	struct sem_array *sma;
+
+	if (IS_ERR(perm))
+		return ERR_CAST(perm);
+
+	sma = container_of(perm, struct sem_array, sem_perm);
+	*locknum = sem_lock(sma, sops, nsops);
+
+	/*
+	 * ipc_rmid() may have already freed the ID while sem_lock() was
+	 * spinning, so the object is re-validated now that the lock is held.
+	 */
+	if (!ipc_valid_object(perm)) {
+		sem_unlock(sma, *locknum);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return sma;
+}
+
+static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
+{
+	struct kern_ipc_perm *perm;
+
+	/* Look the id up; no lock is taken on the returned object here. */
+	perm = ipc_obtain_object(&sem_ids(ns), id);
+	if (!IS_ERR(perm))
+		return container_of(perm, struct sem_array, sem_perm);
+
+	/* Propagate the encoded error as a sem_array error pointer. */
+	return ERR_CAST(perm);
+}
+
+static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
+							int id)
+{
+	struct kern_ipc_perm *perm;
+
+	/* Same as sem_obtain_object(), but via the _check lookup variant. */
+	perm = ipc_obtain_object_check(&sem_ids(ns), id);
+	if (!IS_ERR(perm))
+		return container_of(perm, struct sem_array, sem_perm);
+
+	return ERR_CAST(perm);
+}
+
+static inline void sem_lock_and_putref(struct sem_array *sma)
+{
+	/* nsops == -1 makes sem_lock() take the global (whole array) lock. */
+	sem_lock(sma, NULL, -1);
+	/* Give back a reference on sma once the lock is held. */
+	ipc_rcu_putref(sma, ipc_rcu_free);
+}
+
+static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
+{
+	/* Thin wrapper: hand the embedded permission header to ipc_rmid(). */
+	ipc_rmid(&sem_ids(ns), &s->sem_perm);
+}
+
+/*
+ * Lockless wakeup algorithm:
+ * Without the check/retry algorithm a lockless wakeup is possible:
+ * - queue.status is initialized to -EINTR before blocking.
+ * - wakeup is performed by
+ *	* unlinking the queue entry from the pending list
+ *	* setting queue.status to IN_WAKEUP
+ *	  This is the notification for the blocked thread that a
+ *	  result value is imminent.
+ *	* call wake_up_process
+ *	* set queue.status to the final value.
+ * - the previously blocked thread checks queue.status:
+ *	* if it's IN_WAKEUP, then it must wait until the value changes
+ *	* if it's not -EINTR, then the operation was completed by
+ *	  update_queue. semtimedop can return queue.status without
+ *	  performing any operation on the sem array.
+ *	* otherwise it must acquire the spinlock and check what's up.
+ *
+ * The two-stage algorithm is necessary to protect against the following
+ * races:
+ * - if queue.status is set after wake_up_process, then the woken up idle
+ *   thread could race forward and try (and fail) to acquire sma->lock
+ *   before update_queue had a chance to set queue.status
+ * - if queue.status is written before wake_up_process and if the
+ *   blocked process is woken up by a signal between writing
+ *   queue.status and the wake_up_process, then the woken up
+ *   process could return from semtimedop and die by calling
+ *   sys_exit before wake_up_process is called. Then wake_up_process
+ *   will oops, because the task structure is already invalid.
+ *   (yes, this happened on s390 with sysv msg).
+ *
+ */
+#define IN_WAKEUP	1	/* queue.status while the final result is not yet written */
+
+/**
+ * newary - Create a new semaphore set
+ * @ns: namespace
+ * @params: ptr to the structure that contains key, semflg and nsems
+ *
+ * Called with sem_ids.rwsem held (as a writer)
+ *
+ * Return: the id of the new set on success, or a negative errno:
+ * -EINVAL if nsems is 0, -ENOSPC if the namespace-wide semaphore limit
+ * would be exceeded, -ENOMEM if the set cannot be allocated (including
+ * when its byte size does not fit in an int), or the error reported by
+ * security_sem_alloc() / ipc_addid().
+ */
+static int newary(struct ipc_namespace *ns, struct ipc_params *params)
+{
+	int id;
+	int retval;
+	struct sem_array *sma;
+	int size;
+	key_t key = params->key;
+	int nsems = params->u.nsems;
+	int semflg = params->flg;
+	int i;
+
+	if (!nsems)
+		return -EINVAL;
+	if (ns->used_sems + nsems > ns->sc_semmns)
+		return -ENOSPC;
+
+	/*
+	 * size is an int: refuse sets whose byte count would not fit instead
+	 * of silently truncating it, which would under-allocate the array
+	 * that the initialisation loop below then writes to.
+	 */
+	if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(struct sem))
+		return -ENOMEM;
+
+	size = sizeof(*sma) + nsems * sizeof(struct sem);
+	sma = ipc_rcu_alloc(size);
+	if (!sma)
+		return -ENOMEM;
+
+	memset(sma, 0, size);
+
+	sma->sem_perm.mode = (semflg & S_IRWXUGO);
+	sma->sem_perm.key = key;
+
+	sma->sem_perm.security = NULL;
+	retval = security_sem_alloc(sma);
+	if (retval) {
+		ipc_rcu_putref(sma, ipc_rcu_free);
+		return retval;
+	}
+
+	/* The struct sem array lives directly behind the sem_array header. */
+	sma->sem_base = (struct sem *) &sma[1];
+
+	for (i = 0; i < nsems; i++) {
+		INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
+		INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
+		spin_lock_init(&sma->sem_base[i].lock);
+	}
+
+	sma->complex_count = 0;
+	INIT_LIST_HEAD(&sma->pending_alter);
+	INIT_LIST_HEAD(&sma->pending_const);
+	INIT_LIST_HEAD(&sma->list_id);
+	sma->sem_nsems = nsems;
+	sma->sem_ctime = get_seconds();
+
+	/* The set is fully initialised before it is handed to ipc_addid(). */
+	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
+	if (id < 0) {
+		ipc_rcu_putref(sma, sem_rcu_free);
+		return id;
+	}
+	ns->used_sems += nsems;
+
+	/*
+	 * Drop the object lock and the RCU read lock; presumably both were
+	 * taken by ipc_addid() on success - verify against ipc/util.c.
+	 */
+	sem_unlock(sma, -1);
+	rcu_read_unlock();
+
+	return sma->sem_perm.id;
+}
+
+
+/*
+ * Called with sem_ids.rwsem and ipcp locked.
+ */
+static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg)
+{
+	/* Map the generic permission header back to its semaphore array. */
+	return security_sem_associate(container_of(ipcp, struct sem_array,
+						   sem_perm), semflg);
+}
+
+/*
+ * Called with sem_ids.rwsem and ipcp locked.
+ */
+static inline int sem_more_checks(struct kern_ipc_perm *ipcp,
+				struct ipc_params *params)
+{
+	struct sem_array *set = container_of(ipcp, struct sem_array, sem_perm);
+
+	/* A caller may not ask for more semaphores than the set holds. */
+	return (params->u.nsems > set->sem_nsems) ? -EINVAL : 0;
+}
+
+SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
+{
+	/* Callbacks ipcget() uses to create or validate a semaphore set. */
+	static const struct ipc_ops sem_ops = {
+		.getnew = newary,
+		.associate = sem_security,
+		.more_checks = sem_more_checks,
+	};
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+	struct ipc_params sem_params;
+
+	/* nsems must lie within [0, sc_semmsl] for this namespace. */
+	if (nsems < 0 || nsems > ns->sc_semmsl)
+		return -EINVAL;
+
+	sem_params.u.nsems = nsems;
+	sem_params.flg = semflg;
+	sem_params.key = key;
+
+	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
+}
+
+/**
+ * perform_atomic_semop - Perform (if possible) a semaphore operation
+ * @sma: semaphore array
+ * @q: struct sem_queue that describes the operation
+ *
+ * Returns 0 if the operation was possible.
+ * Returns 1 if the operation is impossible, the caller must sleep.
+ * Negative values are error codes.
+ */
+static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
+{
+	int result, sem_op, nsops, pid;
+	struct sembuf *sop;
+	struct sem *curr;
+	struct sembuf *sops;
+	struct sem_undo *un;
+
+	sops = q->sops;
+	nsops = q->nsops;
+	un = q->undo;
+
+	/*
+	 * Apply the operations one by one. If one of them cannot proceed,
+	 * the ones already applied are rolled back at the undo label.
+	 */
+	for (sop = sops; sop < sops + nsops; sop++) {
+		curr = sma->sem_base + sop->sem_num;
+		sem_op = sop->sem_op;
+		result = curr->semval;
+
+		/* wait-for-zero operation on a semaphore that is not zero */
+		if (!sem_op && result)
+			goto would_block;
+
+		result += sem_op;
+		if (result < 0)
+			goto would_block;
+		if (result > SEMVMX)
+			goto out_of_range;
+
+		if (sop->sem_flg & SEM_UNDO) {
+			int undo = un->semadj[sop->sem_num] - sem_op;
+			/* Exceeding the undo range is an error. */
+			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
+				goto out_of_range;
+			un->semadj[sop->sem_num] = undo;
+		}
+
+		curr->semval = result;
+	}
+
+	/* Everything succeeded: record q->pid in every semaphore touched. */
+	sop--;
+	pid = q->pid;
+	while (sop >= sops) {
+		sma->sem_base[sop->sem_num].sempid = pid;
+		sop--;
+	}
+
+	return 0;
+
+out_of_range:
+	result = -ERANGE;
+	goto undo;
+
+would_block:
+	/* Remember the operation that could not proceed. */
+	q->blocking = sop;
+
+	if (sop->sem_flg & IPC_NOWAIT)
+		result = -EAGAIN;
+	else
+		result = 1;
+
+undo:
+	/*
+	 * sop points at the failing operation, which was not applied; walk
+	 * back over the preceding ones and revert semval and semadj.
+	 */
+	sop--;
+	while (sop >= sops) {
+		sem_op = sop->sem_op;
+		sma->sem_base[sop->sem_num].semval -= sem_op;
+		if (sop->sem_flg & SEM_UNDO)
+			un->semadj[sop->sem_num] += sem_op;
+		sop--;
+	}
+
+	return result;
+}
+
+/**
+ * wake_up_sem_queue_prepare - Prepare wake-up
+ * @pt: list of tasks to be woken up; @q is appended to it
+ * @q: queue entry that must be signaled
+ * @error: Error value for the signal
+ *
+ * Prepare the wake-up of the queue entry q: q->status is set to IN_WAKEUP
+ * and @error is parked in q->pid until wake_up_sem_queue_do() copies it
+ * into q->status. Preemption is disabled when the first entry is added to
+ * an empty @pt; wake_up_sem_queue_do() re-enables it.
+ */
+static void wake_up_sem_queue_prepare(struct list_head *pt,
+				struct sem_queue *q, int error)
+{
+	if (list_empty(pt)) {
+		/*
+		 * Hold preempt off so that we don't get preempted and have the
+		 * wakee busy-wait until we're scheduled back on.
+		 */
+		preempt_disable();
+	}
+	q->status = IN_WAKEUP;
+	q->pid = error;
+
+	list_add_tail(&q->list, pt);
+}
+
+/**
+ * wake_up_sem_queue_do - do the actual wake-up
+ * @pt: list of tasks to be woken up
+ *
+ * Do the actual wake-up.
+ * The function is called without any locks held, thus the semaphore array
+ * could be destroyed already and the tasks can disappear as soon as the
+ * status is set to the actual return code.
+ */
+static void wake_up_sem_queue_do(struct list_head *pt)
+{
+	struct sem_queue *q, *t;
+	int did_something;
+
+	/* A non-empty list means preemption was disabled while queueing. */
+	did_something = !list_empty(pt);
+	list_for_each_entry_safe(q, t, pt, list) {
+		wake_up_process(q->sleeper);
+		/* q can disappear immediately after writing q->status. */
+		smp_wmb();
+		/* q->pid holds the result parked by wake_up_sem_queue_prepare() */
+		q->status = q->pid;
+	}
+	if (did_something)
+		preempt_enable();
+}
+
+static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
+{
+	/* Remove q from whichever pending list it is currently on. */
+	list_del(&q->list);
+	/* Operations with more than one sop are counted in complex_count. */
+	if (q->nsops > 1)
+		sma->complex_count--;
+}
+
+/**
+ * check_restart - decide whether update_queue must rescan its list
+ * @sma: semaphore array
+ * @q: the operation that just completed
+ *
+ * update_queue is O(N^2) when it restarts scanning the whole queue of
+ * waiting operations. Therefore this function checks if the restart is
+ * really necessary. It is called after a previously waiting operation
+ * modified the array.
+ * Note that wait-for-zero operations are handled without restart.
+ *
+ * Returns 1 if a restart is required, 0 otherwise.
+ */
+static int check_restart(struct sem_array *sma, struct sem_queue *q)
+{
+	/*
+	 * A restart is needed in two cases, both too difficult to analyse:
+	 * - complex alter operations are pending, or
+	 * - q itself was a sleeping complex operation.
+	 *
+	 * In every other case it is impossible that someone waits for the
+	 * new value:
+	 * - complex operations always restart.
+	 * - wait-for-zero are handled separately.
+	 * - q is a previously sleeping simple operation that
+	 *   altered the array. It must be a decrement, because
+	 *   simple increments never sleep.
+	 * - If there are older (higher priority) decrements
+	 *   in the queue, then they have observed the original
+	 *   semval value and couldn't proceed. The operation
+	 *   decremented to value - thus they won't proceed either.
+	 */
+	return !list_empty(&sma->pending_alter) || q->nsops > 1;
+}
+
+/**
+ * wake_const_ops - wake up non-alter tasks
+ * @sma: semaphore array.
+ * @semnum: semaphore that was modified.
+ * @pt: list head for the tasks that must be woken up.
+ *
+ * wake_const_ops must be called after a semaphore in a semaphore array
+ * was set to 0. If complex const operations are pending, wake_const_ops must
+ * be called with semnum = -1, as well as with the number of each modified
+ * semaphore.
+ * The tasks that must be woken up are added to @pt. The return code
+ * is stored in q->pid.
+ * The function returns 1 if at least one operation was completed successfully.
+ */
+static int wake_const_ops(struct sem_array *sma, int semnum,
+				struct list_head *pt)
+{
+	struct list_head *queue;
+	struct sem_queue *q, *next;
+	int completed = 0;
+
+	/* semnum == -1 selects the array-wide queue, else the per-semaphore one */
+	queue = (semnum == -1) ? &sma->pending_const :
+				 &sma->sem_base[semnum].pending_const;
+
+	/* _safe variant: q is unlinked from the queue inside the loop body */
+	list_for_each_entry_safe(q, next, queue, list) {
+		int error = perform_atomic_semop(sma, q);
+
+		/* a positive result means the operation still has to sleep */
+		if (error > 0)
+			continue;
+
+		/* operation completed, remove from queue & wakeup */
+		unlink_queue(sma, q);
+		wake_up_sem_queue_prepare(pt, q, error);
+
+		if (error == 0)
+			completed = 1;
+	}
+
+	return completed;
+}
+
+/**
+ * do_smart_wakeup_zero - wakeup all wait for zero tasks
+ * @sma: semaphore array
+ * @sops: operations that were performed
+ * @nsops: number of operations
+ * @pt: list head of the tasks that must be woken up.
+ *
+ * Checks all required queue for wait-for-zero operations, based
+ * on the actual changes that were performed on the semaphore array.
+ * The function returns 1 if at least one operation was completed successfully.
+ */
+static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
+					int nsops, struct list_head *pt)
+{
+	int completed = 0;
+	int saw_zero = 0;
+	int idx;
+
+	if (!sops) {
+		/*
+		 * No sops means modified semaphores not known.
+		 * Assume all were changed.
+		 */
+		for (idx = 0; idx < sma->sem_nsems; idx++) {
+			if (sma->sem_base[idx].semval != 0)
+				continue;
+
+			saw_zero = 1;
+			completed |= wake_const_ops(sma, idx, pt);
+		}
+	} else {
+		/* the per-semaphore queues of the semaphores named in sops */
+		for (idx = 0; idx < nsops; idx++) {
+			int num = sops[idx].sem_num;
+
+			if (sma->sem_base[num].semval != 0)
+				continue;
+
+			saw_zero = 1;
+			completed |= wake_const_ops(sma, num, pt);
+		}
+	}
+
+	/*
+	 * If one of the modified semaphores got 0,
+	 * then check the global queue, too.
+	 */
+	if (saw_zero)
+		completed |= wake_const_ops(sma, -1, pt);
+
+	return completed;
+}
+
+
+/**
+ * update_queue - look for tasks that can be completed.
+ * @sma: semaphore array.
+ * @semnum: semaphore that was modified.
+ * @pt: list head for the tasks that must be woken up.
+ *
+ * update_queue must be called after a semaphore in a semaphore array
+ * was modified. If multiple semaphores were modified, update_queue must
+ * be called with semnum = -1, as well as with the number of each modified
+ * semaphore.
+ * The tasks that must be woken up are added to @pt. The return code
+ * is stored in q->pid.
+ * The function internally checks if const operations can now succeed.
+ *
+ * The function returns 1 if at least one semop was completed successfully.
+ */
+static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
+{
+	struct sem_queue *q;
+	struct list_head *walk;
+	struct list_head *pending_list;
+	int semop_completed = 0;
+
+	/* semnum == -1 selects the array-wide queue, else the per-semaphore one */
+	if (semnum == -1)
+		pending_list = &sma->pending_alter;
+	else
+		pending_list = &sma->sem_base[semnum].pending_alter;
+
+again:
+	/* (Re)start the scan at the head of the list. */
+	walk = pending_list->next;
+	while (walk != pending_list) {
+		int error, restart;
+
+		q = container_of(walk, struct sem_queue, list);
+		/* Advance first: q may be unlinked from the list below. */
+		walk = walk->next;
+
+		/* If we are scanning the single sop, per-semaphore list of
+		 * one semaphore and that semaphore is 0, then it is not
+		 * necessary to scan further: simple increments
+		 * that affect only one entry succeed immediately and cannot
+		 * be in the  per semaphore pending queue, and decrements
+		 * cannot be successful if the value is already 0.
+		 */
+		if (semnum != -1 && sma->sem_base[semnum].semval == 0)
+			break;
+
+		error = perform_atomic_semop(sma, q);
+
+		/* Does q->sleeper still need to sleep? */
+		if (error > 0)
+			continue;
+
+		unlink_queue(sma, q);
+
+		if (error) {
+			/* Negative result: the operation failed, no rescan. */
+			restart = 0;
+		} else {
+			semop_completed = 1;
+			do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
+			restart = check_restart(sma, q);
+		}
+
+		wake_up_sem_queue_prepare(pt, q, error);
+		if (restart)
+			goto again;
+	}
+	return semop_completed;
+}
+
+/**
+ * set_semotime - set sem_otime
+ * @sma: semaphore array
+ * @sops: operations that modified the array, may be NULL
+ *
+ * sem_otime is replicated to avoid cache line thrashing.
+ * This function sets one instance to the current time: the one of the
+ * semaphore named by the first operation, or of semaphore 0 when @sops
+ * is NULL.
+ */
+static void set_semotime(struct sem_array *sma, struct sembuf *sops)
+{
+	int slot = sops ? sops[0].sem_num : 0;
+
+	sma->sem_base[slot].sem_otime = get_seconds();
+}
+
+/**
+ * do_smart_update - optimized update_queue
+ * @sma: semaphore array
+ * @sops: operations that were performed
+ * @nsops: number of operations
+ * @otime: force setting otime
+ * @pt: list head of the tasks that must be woken up.
+ *
+ * do_smart_update() does the required calls to update_queue and wakeup_zero,
+ * based on the actual changes that were performed on the semaphore array.
+ * Note that the function does not do the actual wake-up: the caller is
+ * responsible for calling wake_up_sem_queue_do(@pt).
+ * It is safe to perform this call after dropping all locks.
+ */
+static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
+			int otime, struct list_head *pt)
+{
+	int idx;
+
+	/* Wait-for-zero sleepers are handled first. */
+	otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
+
+	if (!list_empty(&sma->pending_alter)) {
+		/* semaphore array uses the global queue - just process it. */
+		otime |= update_queue(sma, -1, pt);
+	} else if (!sops) {
+		/*
+		 * No sops, thus the modified semaphores are not
+		 * known. Check all.
+		 */
+		for (idx = 0; idx < sma->sem_nsems; idx++)
+			otime |= update_queue(sma, idx, pt);
+	} else {
+		/*
+		 * Check the semaphores that were increased:
+		 * - No complex ops, thus all sleeping ops are
+		 *   decrease.
+		 * - if we decreased the value, then any sleeping
+		 *   semaphore ops wont be able to run: If the
+		 *   previous value was too small, then the new
+		 *   value will be too small, too.
+		 */
+		for (idx = 0; idx < nsops; idx++) {
+			if (sops[idx].sem_op <= 0)
+				continue;
+
+			otime |= update_queue(sma, sops[idx].sem_num, pt);
+		}
+	}
+
+	/* Stamp the array if forced by the caller or if any semop completed. */
+	if (otime)
+		set_semotime(sma, sops);
+}
+
+/*
+ * check_qop: Test if a queued operation sleeps on the semaphore semnum
+ */
+static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q,
+			bool count_zero)
+{
+	struct sembuf *blocked = q->blocking;
+
+	/*
+	 * Linux always (since 0.99.10) reported a task as sleeping on all
+	 * semaphores. This violates SUS, therefore it was changed to the
+	 * standard compliant behavior.
+	 * Give the administrators a chance to notice that an application
+	 * might misbehave because it relies on the Linux behavior.
+	 */
+	pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n"
+			"The task %s (%d) triggered the difference, watch for misbehavior.\n",
+			current->comm, task_pid_nr(current));
+
+	/* Only the operation the task is blocked on is considered. */
+	if (blocked->sem_num != semnum)
+		return 0;
+
+	/* Zero-waiters block on sem_op == 0, the others on a decrement. */
+	if (count_zero)
+		return blocked->sem_op == 0;
+
+	return blocked->sem_op < 0;
+}
+
+/* The following counts are associated to each semaphore:
+ *   semncnt        number of tasks waiting on semval being nonzero
+ *   semzcnt        number of tasks waiting on semval being zero
+ *
+ * Per definition, a task waits only on the semaphore of the first semop
+ * that cannot proceed, even if additional operation would block, too.
+ */
+static int count_semcnt(struct sem_array *sma, ushort semnum,
+			bool count_zero)
+{
+	struct sem *sem = &sma->sem_base[semnum];
+	struct list_head *simple;
+	struct sem_queue *q;
+	int waiters = 0;
+
+	/*
+	 * First the simple operations: every task on a per-semaphore list
+	 * sleeps on exactly that semaphore, so each entry counts.
+	 */
+	simple = count_zero ? &sem->pending_const : &sem->pending_alter;
+	list_for_each_entry(q, simple, list)
+		waiters++;
+
+	/* Then the complex operations, which check_qop() has to inspect. */
+	list_for_each_entry(q, &sma->pending_alter, list)
+		waiters += check_qop(sma, semnum, q, count_zero);
+
+	if (!count_zero)
+		return waiters;
+
+	list_for_each_entry(q, &sma->pending_const, list)
+		waiters += check_qop(sma, semnum, q, count_zero);
+
+	return waiters;
+}
+
+/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
+ * as a writer and the spinlock for this semaphore set held. sem_ids.rwsem
+ * remains locked on exit.
+ */
+static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+{
+	struct sem_undo *un, *tu;
+	struct sem_queue *q, *tq;
+	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
+	struct list_head tasks;
+	int i;
+
+	/* Free the existing undo structures for this semaphore set.  */
+	ipc_assert_locked_object(&sma->sem_perm);
+	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
+		list_del(&un->list_id);
+		spin_lock(&un->ulp->lock);
+		/* Mark the undo entry as no longer belonging to any set. */
+		un->semid = -1;
+		list_del_rcu(&un->list_proc);
+		spin_unlock(&un->ulp->lock);
+		kfree_rcu(un, rcu);
+	}
+
+	/* Wake up all pending processes and let them fail with EIDRM. */
+	INIT_LIST_HEAD(&tasks);
+	/* First the two array-wide queues ... */
+	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
+		unlink_queue(sma, q);
+		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+	}
+
+	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
+		unlink_queue(sma, q);
+		wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+	}
+	/* ... then the two queues of every individual semaphore. */
+	for (i = 0; i < sma->sem_nsems; i++) {
+		struct sem *sem = sma->sem_base + i;
+		list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
+			unlink_queue(sma, q);
+			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+		}
+		list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
+			unlink_queue(sma, q);
+			wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+		}
+	}
+
+	/* Remove the semaphore set from the IDR */
+	sem_rmid(ns, sma);
+	/* Release the object lock and the RCU read lock. */
+	sem_unlock(sma, -1);
+	rcu_read_unlock();
+
+	/* The actual wake-ups happen only after the locks were dropped. */
+	wake_up_sem_queue_do(&tasks);
+	ns->used_sems -= sma->sem_nsems;
+	ipc_rcu_putref(sma, sem_rcu_free);
+}
+
+static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
+{
+	struct semid_ds legacy;
+
+	/* The 64-bit layout is copied out unchanged. */
+	if (version == IPC_64)
+		return copy_to_user(buf, in, sizeof(*in));
+
+	if (version != IPC_OLD)
+		return -EINVAL;
+
+	/* IPC_OLD: convert into a zeroed old-style structure first. */
+	memset(&legacy, 0, sizeof(legacy));
+	ipc64_perm_to_ipc_perm(&in->sem_perm, &legacy.sem_perm);
+
+	legacy.sem_otime = in->sem_otime;
+	legacy.sem_ctime = in->sem_ctime;
+	legacy.sem_nsems = in->sem_nsems;
+
+	return copy_to_user(buf, &legacy, sizeof(legacy));
+}
+
+static time_t get_semotime(struct sem_array *sma)
+{
+	time_t latest = sma->sem_base[0].sem_otime;
+	int idx;
+
+	/* Report the most recent of the per-semaphore sem_otime values. */
+	for (idx = 1; idx < sma->sem_nsems; idx++) {
+		time_t stamp = sma->sem_base[idx].sem_otime;
+
+		if (stamp <= latest)
+			continue;
+
+		latest = stamp;
+	}
+
+	return latest;
+}
+
+/*
+ * Handle the semctl commands IPC_INFO, SEM_INFO, IPC_STAT and SEM_STAT.
+ * No per-object lock is taken here: the INFO commands hold sem_ids.rwsem
+ * for reading, the STAT commands only the RCU read lock.
+ *
+ * Returns a negative errno on failure. On success the INFO commands return
+ * the value of ipc_get_maxid() (0 if that is negative), SEM_STAT returns
+ * sma->sem_perm.id and IPC_STAT returns 0.
+ */
+static int semctl_nolock(struct ipc_namespace *ns, int semid,
+			 int cmd, int version, void __user *p)
+{
+	int err;
+	struct sem_array *sma;
+
+	switch (cmd) {
+	case IPC_INFO:
+	case SEM_INFO:
+	{
+		struct seminfo seminfo;
+		int max_id;
+
+		err = security_sem_semctl(NULL, cmd);
+		if (err)
+			return err;
+
+		memset(&seminfo, 0, sizeof(seminfo));
+		seminfo.semmni = ns->sc_semmni;
+		seminfo.semmns = ns->sc_semmns;
+		seminfo.semmsl = ns->sc_semmsl;
+		seminfo.semopm = ns->sc_semopm;
+		seminfo.semvmx = SEMVMX;
+		seminfo.semmnu = SEMMNU;
+		seminfo.semmap = SEMMAP;
+		seminfo.semume = SEMUME;
+		down_read(&sem_ids(ns).rwsem);
+		if (cmd == SEM_INFO) {
+			/* SEM_INFO reports current usage in these two fields. */
+			seminfo.semusz = sem_ids(ns).in_use;
+			seminfo.semaem = ns->used_sems;
+		} else {
+			seminfo.semusz = SEMUSZ;
+			seminfo.semaem = SEMAEM;
+		}
+		max_id = ipc_get_maxid(&sem_ids(ns));
+		up_read(&sem_ids(ns).rwsem);
+		if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
+			return -EFAULT;
+		return (max_id < 0) ? 0 : max_id;
+	}
+	case IPC_STAT:
+	case SEM_STAT:
+	{
+		struct semid64_ds tbuf;
+		int id = 0;
+
+		memset(&tbuf, 0, sizeof(tbuf));
+
+		rcu_read_lock();
+		if (cmd == SEM_STAT) {
+			/* SEM_STAT uses the unchecked lookup and returns the id. */
+			sma = sem_obtain_object(ns, semid);
+			if (IS_ERR(sma)) {
+				err = PTR_ERR(sma);
+				goto out_unlock;
+			}
+			id = sma->sem_perm.id;
+		} else {
+			sma = sem_obtain_object_check(ns, semid);
+			if (IS_ERR(sma)) {
+				err = PTR_ERR(sma);
+				goto out_unlock;
+			}
+		}
+
+		err = -EACCES;
+		if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
+			goto out_unlock;
+
+		err = security_sem_semctl(sma, cmd);
+		if (err)
+			goto out_unlock;
+
+		kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
+		tbuf.sem_otime = get_semotime(sma);
+		tbuf.sem_ctime = sma->sem_ctime;
+		tbuf.sem_nsems = sma->sem_nsems;
+		/* The snapshot is complete; copy to user space outside of RCU. */
+		rcu_read_unlock();
+		if (copy_semid_to_user(p, &tbuf, version))
+			return -EFAULT;
+		return id;
+	}
+	default:
+		return -EINVAL;
+	}
+out_unlock:
+	rcu_read_unlock();
+	return err;
+}
+
+/*
+ * Handle semctl(SETVAL): set semaphore @semnum of set @semid to the value
+ * carried in @arg, clear the undo adjustments recorded for that semaphore
+ * and wake up any sleeper that can now proceed.
+ * Returns 0 on success or a negative errno.
+ */
+static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
+		unsigned long arg)
+{
+	struct sem_undo *un;
+	struct sem_array *sma;
+	struct sem *curr;
+	int err;
+	struct list_head tasks;
+	int val;
+#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
+	/* big-endian 64bit */
+	val = arg >> 32;
+#else
+	/* 32bit or little-endian 64bit */
+	val = arg;
+#endif
+
+	if (val > SEMVMX || val < 0)
+		return -ERANGE;
+
+	INIT_LIST_HEAD(&tasks);
+
+	rcu_read_lock();
+	sma = sem_obtain_object_check(ns, semid);
+	if (IS_ERR(sma)) {
+		rcu_read_unlock();
+		return PTR_ERR(sma);
+	}
+
+	if (semnum < 0 || semnum >= sma->sem_nsems) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+
+	if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
+		rcu_read_unlock();
+		return -EACCES;
+	}
+
+	/*
+	 * NOTE(review): the error from security_sem_semctl() is replaced by
+	 * -EACCES here, whereas semctl_main(), semctl_nolock() and
+	 * semctl_down() return it unchanged - confirm this is intended.
+	 */
+	err = security_sem_semctl(sma, SETVAL);
+	if (err) {
+		rcu_read_unlock();
+		return -EACCES;
+	}
+
+	/* nsops == -1 makes sem_lock() take the global lock. */
+	sem_lock(sma, NULL, -1);
+
+	if (!ipc_valid_object(&sma->sem_perm)) {
+		sem_unlock(sma, -1);
+		rcu_read_unlock();
+		return -EIDRM;
+	}
+
+	curr = &sma->sem_base[semnum];
+
+	/* Reset the undo adjustment of this semaphore in every undo entry. */
+	ipc_assert_locked_object(&sma->sem_perm);
+	list_for_each_entry(un, &sma->list_id, list_id)
+		un->semadj[semnum] = 0;
+
+	curr->semval = val;
+	curr->sempid = task_tgid_vnr(current);
+	sma->sem_ctime = get_seconds();
+	/* maybe some queued-up processes were waiting for this */
+	do_smart_update(sma, NULL, 0, 0, &tasks);
+	sem_unlock(sma, -1);
+	rcu_read_unlock();
+	/* Wake-ups are performed after all locks have been dropped. */
+	wake_up_sem_queue_do(&tasks);
+	return 0;
+}
+
+/*
+ * Handle the semctl commands GETALL, SETALL, GETVAL, GETPID, GETNCNT and
+ * GETZCNT. Sets with more than SEMMSL_FAST semaphores use a temporary
+ * buffer from ipc_alloc() instead of the on-stack array.
+ * Returns the requested value (GETVAL, GETPID, GETNCNT, GETZCNT), 0
+ * (GETALL, SETALL) or a negative errno.
+ */
+static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
+		int cmd, void __user *p)
+{
+	struct sem_array *sma;
+	struct sem *curr;
+	int err, nsems;
+	ushort fast_sem_io[SEMMSL_FAST];
+	ushort *sem_io = fast_sem_io;
+	struct list_head tasks;
+
+	INIT_LIST_HEAD(&tasks);
+
+	rcu_read_lock();
+	sma = sem_obtain_object_check(ns, semid);
+	if (IS_ERR(sma)) {
+		rcu_read_unlock();
+		return PTR_ERR(sma);
+	}
+
+	nsems = sma->sem_nsems;
+
+	/* SETALL needs write permission, every other command read permission. */
+	err = -EACCES;
+	if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
+		goto out_rcu_wakeup;
+
+	err = security_sem_semctl(sma, cmd);
+	if (err)
+		goto out_rcu_wakeup;
+
+	err = -EACCES;
+	switch (cmd) {
+	case GETALL:
+	{
+		ushort __user *array = p;
+		int i;
+
+		sem_lock(sma, NULL, -1);
+		if (!ipc_valid_object(&sma->sem_perm)) {
+			err = -EIDRM;
+			goto out_unlock;
+		}
+		if (nsems > SEMMSL_FAST) {
+			/*
+			 * The on-stack buffer is too small. Take a reference
+			 * and drop all locks around the allocation, then
+			 * re-lock and re-validate the set.
+			 */
+			if (!ipc_rcu_getref(sma)) {
+				err = -EIDRM;
+				goto out_unlock;
+			}
+			sem_unlock(sma, -1);
+			rcu_read_unlock();
+			sem_io = ipc_alloc(sizeof(ushort)*nsems);
+			if (sem_io == NULL) {
+				ipc_rcu_putref(sma, ipc_rcu_free);
+				return -ENOMEM;
+			}
+
+			rcu_read_lock();
+			sem_lock_and_putref(sma);
+			if (!ipc_valid_object(&sma->sem_perm)) {
+				err = -EIDRM;
+				goto out_unlock;
+			}
+		}
+		for (i = 0; i < sma->sem_nsems; i++)
+			sem_io[i] = sma->sem_base[i].semval;
+		sem_unlock(sma, -1);
+		rcu_read_unlock();
+		err = 0;
+		if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
+			err = -EFAULT;
+		goto out_free;
+	}
+	case SETALL:
+	{
+		int i;
+		struct sem_undo *un;
+
+		/*
+		 * Take a reference so that RCU can be left while the new
+		 * values are fetched from user space and validated.
+		 */
+		if (!ipc_rcu_getref(sma)) {
+			err = -EIDRM;
+			goto out_rcu_wakeup;
+		}
+		rcu_read_unlock();
+
+		if (nsems > SEMMSL_FAST) {
+			sem_io = ipc_alloc(sizeof(ushort)*nsems);
+			if (sem_io == NULL) {
+				ipc_rcu_putref(sma, ipc_rcu_free);
+				return -ENOMEM;
+			}
+		}
+
+		if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
+			ipc_rcu_putref(sma, ipc_rcu_free);
+			err = -EFAULT;
+			goto out_free;
+		}
+
+		for (i = 0; i < nsems; i++) {
+			if (sem_io[i] > SEMVMX) {
+				ipc_rcu_putref(sma, ipc_rcu_free);
+				err = -ERANGE;
+				goto out_free;
+			}
+		}
+		rcu_read_lock();
+		sem_lock_and_putref(sma);
+		if (!ipc_valid_object(&sma->sem_perm)) {
+			err = -EIDRM;
+			goto out_unlock;
+		}
+
+		for (i = 0; i < nsems; i++)
+			sma->sem_base[i].semval = sem_io[i];
+
+		/* Every semaphore was overwritten: clear all undo adjustments. */
+		ipc_assert_locked_object(&sma->sem_perm);
+		list_for_each_entry(un, &sma->list_id, list_id) {
+			for (i = 0; i < nsems; i++)
+				un->semadj[i] = 0;
+		}
+		sma->sem_ctime = get_seconds();
+		/* maybe some queued-up processes were waiting for this */
+		do_smart_update(sma, NULL, 0, 0, &tasks);
+		err = 0;
+		goto out_unlock;
+	}
+	/* GETVAL, GETPID, GETNCNT, GETZCNT: fall-through */
+	}
+	err = -EINVAL;
+	if (semnum < 0 || semnum >= nsems)
+		goto out_rcu_wakeup;
+
+	sem_lock(sma, NULL, -1);
+	if (!ipc_valid_object(&sma->sem_perm)) {
+		err = -EIDRM;
+		goto out_unlock;
+	}
+	curr = &sma->sem_base[semnum];
+
+	/* On success err carries the requested value back to the caller. */
+	switch (cmd) {
+	case GETVAL:
+		err = curr->semval;
+		goto out_unlock;
+	case GETPID:
+		err = curr->sempid;
+		goto out_unlock;
+	case GETNCNT:
+		err = count_semcnt(sma, semnum, 0);
+		goto out_unlock;
+	case GETZCNT:
+		err = count_semcnt(sma, semnum, 1);
+		goto out_unlock;
+	}
+
+	/* Exit ladder: object lock, then RCU and wake-ups, then the buffer. */
+out_unlock:
+	sem_unlock(sma, -1);
+out_rcu_wakeup:
+	rcu_read_unlock();
+	wake_up_sem_queue_do(&tasks);
+out_free:
+	if (sem_io != fast_sem_io)
+		ipc_free(sem_io, sizeof(ushort)*nsems);
+	return err;
+}
+
+static inline unsigned long
+copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
+{
+	struct semid_ds legacy;
+
+	/* The 64-bit layout is read straight into the caller's structure. */
+	if (version == IPC_64)
+		return copy_from_user(out, buf, sizeof(*out)) ? -EFAULT : 0;
+
+	if (version != IPC_OLD)
+		return -EINVAL;
+
+	if (copy_from_user(&legacy, buf, sizeof(legacy)))
+		return -EFAULT;
+
+	/* IPC_OLD: only uid, gid and mode are taken over. */
+	out->sem_perm.uid = legacy.sem_perm.uid;
+	out->sem_perm.gid = legacy.sem_perm.gid;
+	out->sem_perm.mode = legacy.sem_perm.mode;
+
+	return 0;
+}
+
+/*
+ * This function handles some semctl commands which require the rwsem
+ * to be held in write mode.
+ * NOTE: no locks must be held, the rwsem is taken inside this function.
+ */
+static int semctl_down(struct ipc_namespace *ns, int semid,
+		       int cmd, int version, void __user *p)
+{
+	struct sem_array *sma;
+	int err;
+	struct semid64_ds semid64;
+	struct kern_ipc_perm *ipcp;
+
+	/* For IPC_SET the user buffer is fetched before any lock is taken. */
+	if (cmd == IPC_SET) {
+		if (copy_semid_from_user(&semid64, p, version))
+			return -EFAULT;
+	}
+
+	down_write(&sem_ids(ns).rwsem);
+	rcu_read_lock();
+
+	ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd,
+				      &semid64.sem_perm, 0);
+	if (IS_ERR(ipcp)) {
+		err = PTR_ERR(ipcp);
+		goto out_unlock1;
+	}
+
+	sma = container_of(ipcp, struct sem_array, sem_perm);
+
+	err = security_sem_semctl(sma, cmd);
+	if (err)
+		goto out_unlock1;
+
+	switch (cmd) {
+	case IPC_RMID:
+		sem_lock(sma, NULL, -1);
+		/* freeary unlocks the ipc object and rcu */
+		freeary(ns, ipcp);
+		/* Only the rwsem is left to release; err is still 0 here. */
+		goto out_up;
+	case IPC_SET:
+		sem_lock(sma, NULL, -1);
+		err = ipc_update_perm(&semid64.sem_perm, ipcp);
+		if (err)
+			goto out_unlock0;
+		sma->sem_ctime = get_seconds();
+		break;
+	default:
+		err = -EINVAL;
+		goto out_unlock1;
+	}
+
+	/* Exit ladder: object lock, then RCU read lock, then the rwsem. */
+out_unlock0:
+	sem_unlock(sma, -1);
+out_unlock1:
+	rcu_read_unlock();
+out_up:
+	up_write(&sem_ids(ns).rwsem);
+	return err;
+}
+
+/*
+ * semctl system call: validate semid, let ipc_parse_version() examine
+ * (and, since cmd is passed by pointer, possibly adjust) cmd, then
+ * dispatch to the helper that implements the command group.
+ */
+SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
+{
+	int version;
+	struct ipc_namespace *ns;
+	void __user *p = (void __user *)arg;
+
+	if (semid < 0)
+		return -EINVAL;
+
+	version = ipc_parse_version(&cmd);
+	ns = current->nsproxy->ipc_ns;
+
+	switch (cmd) {
+	case IPC_INFO:
+	case SEM_INFO:
+	case IPC_STAT:
+	case SEM_STAT:
+		return semctl_nolock(ns, semid, cmd, version, p);
+	case GETALL:
+	case GETVAL:
+	case GETPID:
+	case GETNCNT:
+	case GETZCNT:
+	case SETALL:
+		return semctl_main(ns, semid, semnum, cmd, p);
+	case SETVAL:
+		/* SETVAL takes arg as a value, not as a user pointer. */
+		return semctl_setval(ns, semid, semnum, arg);
+	case IPC_RMID:
+	case IPC_SET:
+		return semctl_down(ns, semid, cmd, version, p);
+	default:
+		return -EINVAL;
+	}
+}
+
+/* If the task doesn't already have an undo_list, then allocate one
+ * here.  We guarantee there is only one thread using this undo list,
+ * and current is THE ONE
+ *
+ * If this allocation and assignment succeeds, but later
+ * portions of this code fail, there is no need to free the sem_undo_list.
+ * Just let it stay associated with the task, and it'll be freed later
+ * at exit time.
+ *
+ * This can block, so callers must hold no locks.
+ */
+static inline int get_undo_list(struct sem_undo_list **undo_listp)
+{
+	struct sem_undo_list *undo_list;
+
+	undo_list = current->sysvsem.undo_list;
+	if (!undo_list) {
+		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
+		if (undo_list == NULL)
+			return -ENOMEM;
+		spin_lock_init(&undo_list->lock);
+		atomic_set(&undo_list->refcnt, 1);
+		INIT_LIST_HEAD(&undo_list->list_proc);
+
+		current->sysvsem.undo_list = undo_list;
+	}
+	*undo_listp = undo_list;
+	return 0;
+}
+
+/*
+ * Walk @ulp's per-process undo list and return the entry registered for
+ * @semid, or NULL if there is none. The list is not modified.
+ */
+static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
+{
+	struct sem_undo *pos;
+	struct sem_undo *found = NULL;
+
+	list_for_each_entry_rcu(pos, &ulp->list_proc, list_proc) {
+		if (pos->semid != semid)
+			continue;
+		found = pos;
+		break;
+	}
+	return found;
+}
+
+/*
+ * Like __lookup_undo(), but additionally re-inserts a found entry right
+ * after the list head. The caller must hold ulp->lock.
+ */
+static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
+{
+	struct sem_undo *hit;
+
+	assert_spin_locked(&ulp->lock);
+
+	hit = __lookup_undo(ulp, semid);
+	if (!hit)
+		return NULL;
+
+	list_del_rcu(&hit->list_proc);
+	list_add_rcu(&hit->list_proc, &ulp->list_proc);
+	return hit;
+}
+
+/**
+ * find_alloc_undo - lookup (and if not present create) undo array
+ * @ns: namespace
+ * @semid: semaphore array id
+ *
+ * The function looks up (and if not present creates) the undo structure.
+ * The size of the undo structure depends on the size of the semaphore
+ * array, thus the alloc path is not that straightforward.
+ * Lifetime-rules: sem_undo is rcu-protected, on success, the function
+ * performs a rcu_read_lock().
+ *
+ * On failure an ERR_PTR() is returned and the rcu read lock is not held.
+ */
+static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
+{
+	struct sem_array *sma;
+	struct sem_undo_list *ulp;
+	struct sem_undo *un, *new;
+	int nsems, error;
+
+	error = get_undo_list(&ulp);
+	if (error)
+		return ERR_PTR(error);
+
+	/* fast path: the task already has an undo structure for this id */
+	rcu_read_lock();
+	spin_lock(&ulp->lock);
+	un = lookup_undo(ulp, semid);
+	spin_unlock(&ulp->lock);
+	if (likely(un != NULL))
+		goto out;
+
+	/* no undo structure around - allocate one. */
+	/* step 1: figure out the size of the semaphore array */
+	sma = sem_obtain_object_check(ns, semid);
+	if (IS_ERR(sma)) {
+		rcu_read_unlock();
+		return ERR_CAST(sma);
+	}
+
+	nsems = sma->sem_nsems;
+	/* take a reference before the rcu read lock is dropped below */
+	if (!ipc_rcu_getref(sma)) {
+		rcu_read_unlock();
+		un = ERR_PTR(-EIDRM);
+		goto out;
+	}
+	rcu_read_unlock();
+
+	/* step 2: allocate new undo structure */
+	/* GFP_KERNEL may sleep, which is why rcu was dropped above */
+	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
+	if (!new) {
+		ipc_rcu_putref(sma, ipc_rcu_free);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* step 3: Acquire the lock on semaphore array */
+	rcu_read_lock();
+	sem_lock_and_putref(sma);
+	/* the array may have been removed while no lock was held */
+	if (!ipc_valid_object(&sma->sem_perm)) {
+		sem_unlock(sma, -1);
+		rcu_read_unlock();
+		kfree(new);
+		un = ERR_PTR(-EIDRM);
+		goto out;
+	}
+	spin_lock(&ulp->lock);
+
+	/*
+	 * step 4: check for races: did someone else allocate the undo struct?
+	 */
+	un = lookup_undo(ulp, semid);
+	if (un) {
+		kfree(new);
+		goto success;
+	}
+	/* step 5: initialize & link new undo structure */
+	/* the semadj array lives directly behind the struct (see kzalloc) */
+	new->semadj = (short *) &new[1];
+	new->ulp = ulp;
+	new->semid = semid;
+	assert_spin_locked(&ulp->lock);
+	list_add_rcu(&new->list_proc, &ulp->list_proc);
+	ipc_assert_locked_object(&sma->sem_perm);
+	list_add(&new->list_id, &sma->list_id);
+	un = new;
+
+success:
+	/* locks are dropped, the rcu read lock stays held for the caller */
+	spin_unlock(&ulp->lock);
+	sem_unlock(sma, -1);
+out:
+	return un;
+}
+
+
+/**
+ * get_queue_result - retrieve the result code from sem_queue
+ * @q: Pointer to queue structure
+ *
+ * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
+ * q->status, then we must loop until the value is replaced with the final
+ * value: This may happen if a task is woken up by an unrelated event (e.g.
+ * signal) and in parallel the task is woken up by another task because it got
+ * the requested semaphores.
+ *
+ * The function can be called with or without holding the semaphore spinlock.
+ *
+ * Return: the final value of q->status (never IN_WAKEUP).
+ */
+static int get_queue_result(struct sem_queue *q)
+{
+	int error;
+
+	error = q->status;
+	/* busy-wait: re-read the status until the transient value is gone */
+	while (unlikely(error == IN_WAKEUP)) {
+		cpu_relax();
+		error = q->status;
+	}
+
+	return error;
+}
+
+/*
+ * semtimedop() system call: perform the @nsops operations in @tsops on
+ * the semaphore array @semid, sleeping (optionally bounded by @timeout)
+ * until they can be performed.
+ *
+ * Returns 0 on success or a negative errno, among them:
+ *   -EINVAL  nsops < 1, semid < 0 or a malformed timeout
+ *   -E2BIG   nsops exceeds the namespace's sc_semopm limit
+ *   -EFAULT  @tsops or @timeout could not be copied from user space
+ *   -EFBIG   an operation names a semaphore index outside the array
+ *   -EACCES  permission check failed
+ *   -EIDRM   the array was removed
+ *   -EAGAIN  the timeout expired
+ *   -EINTR   the sleep was interrupted by a signal
+ */
+SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
+		unsigned, nsops, const struct timespec __user *, timeout)
+{
+	int error = -EINVAL;
+	struct sem_array *sma;
+	struct sembuf fast_sops[SEMOPM_FAST];
+	struct sembuf *sops = fast_sops, *sop;
+	struct sem_undo *un;
+	int undos = 0, alter = 0, max, locknum;
+	struct sem_queue queue;
+	unsigned long jiffies_left = 0;
+	struct ipc_namespace *ns;
+	struct list_head tasks;
+
+	ns = current->nsproxy->ipc_ns;
+
+	if (nsops < 1 || semid < 0)
+		return -EINVAL;
+	if (nsops > ns->sc_semopm)
+		return -E2BIG;
+	/* small requests use the on-stack buffer, larger ones the heap */
+	if (nsops > SEMOPM_FAST) {
+		sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL);
+		if (sops == NULL)
+			return -ENOMEM;
+	}
+	if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
+		error =  -EFAULT;
+		goto out_free;
+	}
+	if (timeout) {
+		struct timespec _timeout;
+		if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
+			error = -EFAULT;
+			goto out_free;
+		}
+		if (_timeout.tv_sec < 0 || _timeout.tv_nsec < 0 ||
+			_timeout.tv_nsec >= 1000000000L) {
+			error = -EINVAL;
+			goto out_free;
+		}
+		jiffies_left = timespec_to_jiffies(&_timeout);
+	}
+	/*
+	 * Scan the operations once: remember the highest semaphore index,
+	 * whether any operation asks for SEM_UNDO and whether any operation
+	 * modifies a value (sem_op != 0).
+	 */
+	max = 0;
+	for (sop = sops; sop < sops + nsops; sop++) {
+		if (sop->sem_num >= max)
+			max = sop->sem_num;
+		if (sop->sem_flg & SEM_UNDO)
+			undos = 1;
+		if (sop->sem_op != 0)
+			alter = 1;
+	}
+
+	INIT_LIST_HEAD(&tasks);
+
+	if (undos) {
+		/* On success, find_alloc_undo takes the rcu_read_lock */
+		un = find_alloc_undo(ns, semid);
+		if (IS_ERR(un)) {
+			error = PTR_ERR(un);
+			goto out_free;
+		}
+	} else {
+		un = NULL;
+		rcu_read_lock();
+	}
+
+	/* from here on the rcu read lock is held on both branches */
+	sma = sem_obtain_object_check(ns, semid);
+	if (IS_ERR(sma)) {
+		rcu_read_unlock();
+		error = PTR_ERR(sma);
+		goto out_free;
+	}
+
+	error = -EFBIG;
+	if (max >= sma->sem_nsems)
+		goto out_rcu_wakeup;
+
+	error = -EACCES;
+	if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO))
+		goto out_rcu_wakeup;
+
+	error = security_sem_semop(sma, sops, nsops, alter);
+	if (error)
+		goto out_rcu_wakeup;
+
+	error = -EIDRM;
+	locknum = sem_lock(sma, sops, nsops);
+	/*
+	 * We eventually might perform the following check in a lockless
+	 * fashion, considering ipc_valid_object() locking constraints.
+	 * If nsops == 1 and there is no contention for sem_perm.lock, then
+	 * only a per-semaphore lock is held and it's OK to proceed with the
+	 * check below. More details on the fine grained locking scheme
+	 * entangled here and why it's RMID race safe on comments at sem_lock()
+	 */
+	if (!ipc_valid_object(&sma->sem_perm))
+		goto out_unlock_free;
+	/*
+	 * semid identifiers are not unique - find_alloc_undo may have
+	 * allocated an undo structure, it was invalidated by an RMID
+	 * and now a new array has received the same id. Check and fail.
+	 * This case can be detected checking un->semid. The existence of
+	 * "un" itself is guaranteed by rcu.
+	 */
+	if (un && un->semid == -1)
+		goto out_unlock_free;
+
+	queue.sops = sops;
+	queue.nsops = nsops;
+	queue.undo = un;
+	queue.pid = task_tgid_vnr(current);
+	queue.alter = alter;
+
+	/*
+	 * As used below: 0 means the operations were performed, a negative
+	 * value is a final error, a positive value means we have to sleep.
+	 */
+	error = perform_atomic_semop(sma, &queue);
+	if (error == 0) {
+		/* If the operation was successful, then do
+		 * the required updates.
+		 */
+		if (alter)
+			do_smart_update(sma, sops, nsops, 1, &tasks);
+		else
+			set_semotime(sma, sops);
+	}
+	if (error <= 0)
+		goto out_unlock_free;
+
+	/* We need to sleep on this operation, so we put the current
+	 * task into the pending queue and go to sleep.
+	 */
+
+	if (nsops == 1) {
+		struct sem *curr;
+		curr = &sma->sem_base[sops->sem_num];
+
+		if (alter) {
+			/*
+			 * While complex operations are pending, single
+			 * altering operations go to the array-wide queue
+			 * instead of the per-semaphore one.
+			 */
+			if (sma->complex_count) {
+				list_add_tail(&queue.list,
+						&sma->pending_alter);
+			} else {
+
+				list_add_tail(&queue.list,
+						&curr->pending_alter);
+			}
+		} else {
+			list_add_tail(&queue.list, &curr->pending_const);
+		}
+	} else {
+		/* first complex operation: merge queues before queueing */
+		if (!sma->complex_count)
+			merge_queues(sma);
+
+		if (alter)
+			list_add_tail(&queue.list, &sma->pending_alter);
+		else
+			list_add_tail(&queue.list, &sma->pending_const);
+
+		sma->complex_count++;
+	}
+
+	/* -EINTR is the default result unless a waker stores another one */
+	queue.status = -EINTR;
+	queue.sleeper = current;
+
+sleep_again:
+	/* set the task state before dropping the lock, then sleep */
+	__set_current_state(TASK_INTERRUPTIBLE);
+	sem_unlock(sma, locknum);
+	rcu_read_unlock();
+
+	if (timeout)
+		jiffies_left = schedule_timeout(jiffies_left);
+	else
+		schedule();
+
+	error = get_queue_result(&queue);
+
+	if (error != -EINTR) {
+		/* fast path: update_queue already obtained all requested
+		 * resources.
+		 * Perform a smp_mb(): User space could assume that semop()
+		 * is a memory barrier: Without the mb(), the cpu could
+		 * speculatively read in user space stale data that was
+		 * overwritten by the previous owner of the semaphore.
+		 */
+		smp_mb();
+
+		goto out_free;
+	}
+
+	rcu_read_lock();
+	sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum);
+
+	/*
+	 * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
+	 */
+	error = get_queue_result(&queue);
+
+	/*
+	 * Array removed? If yes, leave without sem_unlock().
+	 */
+	if (IS_ERR(sma)) {
+		rcu_read_unlock();
+		goto out_free;
+	}
+
+
+	/*
+	 * If queue.status != -EINTR we are woken up by another process.
+	 * Leave without unlink_queue(), but with sem_unlock().
+	 */
+	if (error != -EINTR)
+		goto out_unlock_free;
+
+	/*
+	 * If an interrupt occurred we have to clean up the queue
+	 */
+	if (timeout && jiffies_left == 0)
+		error = -EAGAIN;
+
+	/*
+	 * If the wakeup was spurious, just retry
+	 */
+	if (error == -EINTR && !signal_pending(current))
+		goto sleep_again;
+
+	unlink_queue(sma, &queue);
+
+out_unlock_free:
+	sem_unlock(sma, locknum);
+out_rcu_wakeup:
+	rcu_read_unlock();
+	/* wake the tasks collected on the local list, outside the locks */
+	wake_up_sem_queue_do(&tasks);
+out_free:
+	/* only a heap-allocated operation buffer needs freeing */
+	if (sops != fast_sops)
+		kfree(sops);
+	return error;
+}
+
+/*
+ * semop() system call: identical to semtimedop() with a NULL timeout,
+ * i.e. a blocking operation sleeps without a time limit.
+ */
+SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
+		unsigned, nsops)
+{
+	return sys_semtimedop(semid, tsops, nsops, NULL);
+}
+
+/*
+ * Set up the SEM_UNDO state of a new task @tsk.
+ *
+ * With CLONE_SYSVSEM the child shares current's undo list (allocated on
+ * demand) and the list's reference count is raised; without it the child
+ * starts with no undo list.
+ *
+ * Returns 0 on success or the error from get_undo_list().
+ */
+
+int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
+{
+	struct sem_undo_list *shared;
+	int error;
+
+	if (!(clone_flags & CLONE_SYSVSEM)) {
+		tsk->sysvsem.undo_list = NULL;
+		return 0;
+	}
+
+	error = get_undo_list(&shared);
+	if (error)
+		return error;
+
+	atomic_inc(&shared->refcnt);
+	tsk->sysvsem.undo_list = shared;
+	return 0;
+}
+
+/*
+ * add semadj values to semaphores, free undo structures.
+ * undo structures are not freed when semaphore arrays are destroyed
+ * so some of them may be out of date.
+ * IMPLEMENTATION NOTE: There is some confusion over whether the
+ * set of adjustments that needs to be done should be done in an atomic
+ * manner or not. That is, if we are attempting to decrement the semval
+ * should we queue up and wait until we can do so legally?
+ * The original implementation attempted to do this (queue and wait).
+ * The current implementation does not do so. The POSIX standard
+ * and SVID should be consulted to determine what behavior is mandated.
+ *
+ * @tsk: the exiting task. Its undo list is detached; the list is only
+ * processed and freed when this was the last reference to it.
+ */
+void exit_sem(struct task_struct *tsk)
+{
+	struct sem_undo_list *ulp;
+
+	ulp = tsk->sysvsem.undo_list;
+	if (!ulp)
+		return;
+	tsk->sysvsem.undo_list = NULL;
+
+	/* the list may be shared (CLONE_SYSVSEM); only the last user cleans up */
+	if (!atomic_dec_and_test(&ulp->refcnt))
+		return;
+
+	for (;;) {
+		struct sem_array *sma;
+		struct sem_undo *un;
+		struct list_head tasks;
+		int semid, i;
+
+		rcu_read_lock();
+		un = list_entry_rcu(ulp->list_proc.next,
+				    struct sem_undo, list_proc);
+		if (&un->list_proc == &ulp->list_proc) {
+			/*
+			 * The list is empty. A concurrent IPC_RMID may have
+			 * unlinked the last entry and still be inside its
+			 * ulp->lock critical section, so pass through the
+			 * lock once before ulp is freed below.
+			 */
+			spin_lock(&ulp->lock);
+			spin_unlock(&ulp->lock);
+			rcu_read_unlock();
+			break;
+		}
+
+		/*
+		 * Sample the id exactly once, under ulp->lock, and use that
+		 * value for the lookup below instead of re-reading un->semid.
+		 * NOTE(review): relies on the RMID path setting semid to -1
+		 * and unlinking the entry while holding ulp->lock - confirm
+		 * against freeary().
+		 */
+		spin_lock(&ulp->lock);
+		semid = un->semid;
+		spin_unlock(&ulp->lock);
+
+		/*
+		 * exit_sem raced with IPC_RMID: this entry is already
+		 * invalidated, but other entries may remain. Look at the
+		 * list again instead of treating it as empty.
+		 */
+		if (semid == -1) {
+			rcu_read_unlock();
+			continue;
+		}
+
+		sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid);
+		/* exit_sem raced with IPC_RMID, nothing to do */
+		if (IS_ERR(sma)) {
+			rcu_read_unlock();
+			continue;
+		}
+
+		sem_lock(sma, NULL, -1);
+		/* exit_sem raced with IPC_RMID, nothing to do */
+		if (!ipc_valid_object(&sma->sem_perm)) {
+			sem_unlock(sma, -1);
+			rcu_read_unlock();
+			continue;
+		}
+		un = __lookup_undo(ulp, semid);
+		if (un == NULL) {
+			/* exit_sem raced with IPC_RMID+semget() that created
+			 * exactly the same semid. Nothing to do.
+			 */
+			sem_unlock(sma, -1);
+			rcu_read_unlock();
+			continue;
+		}
+
+		/* remove un from the linked lists */
+		ipc_assert_locked_object(&sma->sem_perm);
+		list_del(&un->list_id);
+
+		spin_lock(&ulp->lock);
+		list_del_rcu(&un->list_proc);
+		spin_unlock(&ulp->lock);
+
+		/* perform adjustments registered in un */
+		for (i = 0; i < sma->sem_nsems; i++) {
+			struct sem *semaphore = &sma->sem_base[i];
+			if (un->semadj[i]) {
+				semaphore->semval += un->semadj[i];
+				/*
+				 * Range checks of the new semaphore value,
+				 * not defined by sus:
+				 * - Some unices ignore the undo entirely
+				 *   (e.g. HP UX 11i 11.22, Tru64 V5.1)
+				 * - some cap the value (e.g. FreeBSD caps
+				 *   at 0, but doesn't enforce SEMVMX)
+				 *
+				 * Linux caps the semaphore value, both at 0
+				 * and at SEMVMX.
+				 *
+				 *	Manfred <manfred@colorfullife.com>
+				 */
+				if (semaphore->semval < 0)
+					semaphore->semval = 0;
+				if (semaphore->semval > SEMVMX)
+					semaphore->semval = SEMVMX;
+				semaphore->sempid = task_tgid_vnr(current);
+			}
+		}
+		/* maybe some queued-up processes were waiting for this */
+		INIT_LIST_HEAD(&tasks);
+		do_smart_update(sma, NULL, 0, 1, &tasks);
+		sem_unlock(sma, -1);
+		rcu_read_unlock();
+		wake_up_sem_queue_do(&tasks);
+
+		/* readers may still hold un under rcu; free after a grace period */
+		kfree_rcu(un, rcu);
+	}
+	kfree(ulp);
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Emit one line describing a semaphore array into the seq_file @s.
+ * @it is the struct sem_array to print. User and group ids are
+ * translated into the user namespace of the reader. Always returns 0.
+ */
+static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
+{
+	struct user_namespace *user_ns = seq_user_ns(s);
+	struct sem_array *sma = it;
+	time_t sem_otime;
+
+	/*
+	 * The proc interface isn't aware of sem_lock(), it calls
+	 * ipc_lock_object() directly (in sysvipc_find_ipc).
+	 * In order to stay compatible with sem_lock(), we must wait until
+	 * all simple semop() calls have left their critical regions.
+	 */
+	sem_wait_array(sma);
+
+	sem_otime = get_semotime(sma);
+
+	/* columns: key, id, mode (octal), nsems, uid, gid, cuid, cgid, otime, ctime */
+	seq_printf(s,
+		   "%10d %10d  %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
+		   sma->sem_perm.key,
+		   sma->sem_perm.id,
+		   sma->sem_perm.mode,
+		   sma->sem_nsems,
+		   from_kuid_munged(user_ns, sma->sem_perm.uid),
+		   from_kgid_munged(user_ns, sma->sem_perm.gid),
+		   from_kuid_munged(user_ns, sma->sem_perm.cuid),
+		   from_kgid_munged(user_ns, sma->sem_perm.cgid),
+		   sem_otime,
+		   sma->sem_ctime);
+
+	return 0;
+}
+#endif
diff --git a/ipc/shm.c b/ipc/shm.c
new file mode 100644
index 000000000..d2f284c1f
--- /dev/null
+++ b/ipc/shm.c
@@ -0,0 +1,1368 @@
+/*
+ * linux/ipc/shm.c
+ * Copyright (C) 1992, 1993 Krishna Balasubramanian
+ *	 Many improvements/fixes by Bruno Haible.
+ * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
+ * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
+ *
+ * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
+ * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
+ * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
+ * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
+ * Make shmmax, shmall, shmmni sysctl'able, Christoph Rohland <cr@sap.com>
+ * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
+ * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
+ *
+ * support for audit of ipc object properties and permission changes
+ * Dustin Kirkland <dustin.kirkland@us.ibm.com>
+ *
+ * namespaces support
+ * OpenVZ, SWsoft Inc.
+ * Pavel Emelianov <xemul@openvz.org>
+ *
+ * Better ipc lock (kern_ipc_perm.lock) handling
+ * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013.
+ */
+
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/shm.h>
+#include <linux/init.h>
+#include <linux/file.h>
+#include <linux/mman.h>
+#include <linux/shmem_fs.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/audit.h>
+#include <linux/capability.h>
+#include <linux/ptrace.h>
+#include <linux/seq_file.h>
+#include <linux/rwsem.h>
+#include <linux/nsproxy.h>
+#include <linux/mount.h>
+#include <linux/ipc_namespace.h>
+
+#include <linux/uaccess.h>
+
+#include "util.h"
+
+/*
+ * Per-attach private data, stored in file->private_data of the wrapper
+ * file (see the shm_file_data() accessor).
+ */
+struct shm_file_data {
+	int id;				/* segment id, used to look the segment up again */
+	struct ipc_namespace *ns;	/* namespace of the segment; put in shm_release() */
+	struct file *file;		/* underlying file the file operations forward to */
+	const struct vm_operations_struct *vm_ops; /* vm_ops of the underlying mapping, saved in shm_mmap() */
+};
+
+/* lvalue accessor: file->private_data viewed as a struct shm_file_data pointer */
+#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
+
+/* defined further down in this file */
+static const struct file_operations shm_file_operations;
+static const struct vm_operations_struct shm_vm_ops;
+
+/* the shared memory id table of namespace @ns */
+#define shm_ids(ns)	((ns)->ids[IPC_SHM_IDS])
+
+/* unlock a segment through its embedded kern_ipc_perm */
+#define shm_unlock(shp)			\
+	ipc_unlock(&(shp)->shm_perm)
+
+/* forward declarations for functions used before their definition */
+static int newseg(struct ipc_namespace *, struct ipc_params *);
+static void shm_open(struct vm_area_struct *vma);
+static void shm_close(struct vm_area_struct *vma);
+static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
+#ifdef CONFIG_PROC_FS
+static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
+#endif
+
+/*
+ * Reset the shared memory state of @ns: default limits, forced removal
+ * switched off, no pages accounted, and a freshly initialised id table.
+ */
+void shm_init_ns(struct ipc_namespace *ns)
+{
+	/* accounting and policy */
+	ns->shm_tot = 0;
+	ns->shm_rmid_forced = 0;
+
+	/* limits */
+	ns->shm_ctlmni = SHMMNI;
+	ns->shm_ctlall = SHMALL;
+	ns->shm_ctlmax = SHMMAX;
+
+	ipc_init_ids(&shm_ids(ns));
+}
+
+/*
+ * Remove a segment, or mark it for removal while it is still attached.
+ *
+ * Called with shm_ids.rwsem (writer) and the shp structure locked.
+ * Only shm_ids.rwsem remains locked on exit.
+ */
+static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
+{
+	struct shmid_kernel *shp =
+		container_of(ipcp, struct shmid_kernel, shm_perm);
+
+	/* nobody attached: destroy right away */
+	if (!shp->shm_nattch) {
+		shm_destroy(ns, shp);
+		return;
+	}
+
+	/* still attached: flag as destroyed and hide it from key lookups */
+	shp->shm_perm.mode |= SHM_DEST;
+	shp->shm_perm.key = IPC_PRIVATE;
+	shm_unlock(shp);
+}
+
+#ifdef CONFIG_IPC_NS
+/*
+ * Tear down the shared memory part of @ns: run do_shm_rmid() on every
+ * remaining segment, then destroy the id table's idr.
+ */
+void shm_exit_ns(struct ipc_namespace *ns)
+{
+	free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
+	idr_destroy(&shm_ids(ns).ipcs_idr);
+}
+#endif
+
+/*
+ * Boot-time setup of the shared memory state of the initial ipc
+ * namespace. Registered as a pure_initcall below; always returns 0.
+ */
+static int __init ipc_ns_init(void)
+{
+	shm_init_ns(&init_ipc_ns);
+	return 0;
+}
+
+pure_initcall(ipc_ns_init);
+
+/*
+ * Register the "sysvipc/shm" proc interface. The header line comes in
+ * two layouts: the second one, with wider size/rss/swap columns, is
+ * used when BITS_PER_LONG is greater than 32.
+ */
+void __init shm_init(void)
+{
+	ipc_init_proc_interface("sysvipc/shm",
+#if BITS_PER_LONG <= 32
+				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        rss       swap\n",
+#else
+				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   rss                  swap\n",
+#endif
+				IPC_SHM_IDS, sysvipc_shm_proc_show);
+}
+
+/*
+ * Look up segment @id in @ns via ipc_obtain_object() and return the
+ * enclosing shmid_kernel, or the lookup's error as an ERR_PTR.
+ */
+static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id)
+{
+	struct kern_ipc_perm *perm;
+
+	perm = ipc_obtain_object(&shm_ids(ns), id);
+	if (!IS_ERR(perm))
+		return container_of(perm, struct shmid_kernel, shm_perm);
+
+	return ERR_CAST(perm);
+}
+
+/*
+ * Same as shm_obtain_object(), but the lookup goes through
+ * ipc_obtain_object_check().
+ */
+static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id)
+{
+	struct kern_ipc_perm *perm;
+
+	perm = ipc_obtain_object_check(&shm_ids(ns), id);
+	if (!IS_ERR(perm))
+		return container_of(perm, struct shmid_kernel, shm_perm);
+
+	return ERR_CAST(perm);
+}
+
+/*
+ * shm_lock_(check_) routines are called in the paths where the rwsem
+ * is not necessarily held.
+ *
+ * shm_lock() looks up segment @id in @ns with ipc_lock() and returns
+ * the enclosing shmid_kernel. On failure the error is passed on as an
+ * ERR_PTR, so callers must test the result with IS_ERR().
+ */
+static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
+{
+	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
+
+	/* ERR_CAST() instead of a raw cast, like the shm_obtain_object*() helpers */
+	if (IS_ERR(ipcp))
+		return ERR_CAST(ipcp);
+
+	return container_of(ipcp, struct shmid_kernel, shm_perm);
+}
+
+/*
+ * Lock a segment the caller already holds a pointer to: enter an rcu
+ * read-side section, then take the object lock. No id lookup is done.
+ */
+static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)
+{
+	rcu_read_lock();
+	ipc_lock_object(&ipcp->shm_perm);
+}
+
+/*
+ * RCU callback that frees a segment: releases the security data of the
+ * shmid_kernel, then hands the rcu head on to ipc_rcu_free().
+ */
+static void shm_rcu_free(struct rcu_head *head)
+{
+	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
+	struct shmid_kernel *shp = ipc_rcu_to_struct(p);
+
+	security_shm_free(shp);
+	ipc_rcu_free(head);
+}
+
+/*
+ * Unlink segment @s from its shm_clist list and remove its id from the
+ * id table of @ns.
+ */
+static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
+{
+	list_del(&s->shm_clist);
+	ipc_rmid(&shm_ids(ns), &s->shm_perm);
+}
+
+
+/*
+ * This is called by fork, once for every shm attach.
+ * It is also called directly from shm_mmap() for a new mapping.
+ *
+ * Records the attach on the segment: attach time, pid of the last
+ * operation and the attach count are updated under the segment lock.
+ */
+static void shm_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct shm_file_data *sfd = shm_file_data(file);
+	struct shmid_kernel *shp;
+
+	shp = shm_lock(sfd->ns, sfd->id);
+	/* a failed lookup is treated as a fatal inconsistency */
+	BUG_ON(IS_ERR(shp));
+	shp->shm_atim = get_seconds();
+	shp->shm_lprid = task_tgid_vnr(current);
+	shp->shm_nattch++;
+	shm_unlock(shp);
+}
+
+/*
+ * shm_destroy - free the struct shmid_kernel
+ *
+ * @ns: namespace
+ * @shp: struct to free
+ *
+ * It has to be called with shp and shm_ids.rwsem (writer) locked,
+ * but returns with shp unlocked and freed.
+ */
+static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+{
+	struct file *shm_file;
+
+	/* detach the backing file from the segment; it is released below */
+	shm_file = shp->shm_file;
+	shp->shm_file = NULL;
+	/* give back the segment's pages (size rounded up to whole pages) */
+	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	shm_rmid(ns, shp);
+	shm_unlock(shp);
+	/* undo memory locking: shmem files and hugepage files differ here */
+	if (!is_file_hugepages(shm_file))
+		shmem_lock(shm_file, 0, shp->mlock_user);
+	else if (shp->mlock_user)
+		user_shm_unlock(i_size_read(file_inode(shm_file)),
+				shp->mlock_user);
+	fput(shm_file);
+	/* drop the reference; shm_rcu_free is the free callback */
+	ipc_rcu_putref(shp, shm_rcu_free);
+}
+
+/*
+ * shm_may_destroy - identifies whether shm segment should be destroyed now
+ *
+ * A segment with attaches left is never destroyed. An unattached one is
+ * destroyed if either
+ *
+ * 1) sysctl kernel.shm_rmid_forced is set to 1, or
+ *
+ * 2) shmctl(id, IPC_RMID, NULL) was called for this shp (SHM_DEST set).
+ */
+static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
+{
+	if (shp->shm_nattch != 0)
+		return false;
+
+	if (ns->shm_rmid_forced)
+		return true;
+
+	return (shp->shm_perm.mode & SHM_DEST) != 0;
+}
+
+/*
+ * remove the attach descriptor vma.
+ * free memory for segment if it is marked destroyed.
+ * The descriptor has already been removed from the current->mm->mmap list
+ * and will later be kfree()d.
+ *
+ * Takes shm_ids.rwsem as a writer, since the segment may be destroyed.
+ */
+static void shm_close(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct shm_file_data *sfd = shm_file_data(file);
+	struct shmid_kernel *shp;
+	struct ipc_namespace *ns = sfd->ns;
+
+	down_write(&shm_ids(ns).rwsem);
+	/* remove from the list of attaches of the shm segment */
+	shp = shm_lock(ns, sfd->id);
+	BUG_ON(IS_ERR(shp));
+	shp->shm_lprid = task_tgid_vnr(current);
+	shp->shm_dtim = get_seconds();
+	shp->shm_nattch--;
+	/* shm_destroy() unlocks shp itself; otherwise unlock it here */
+	if (shm_may_destroy(ns, shp))
+		shm_destroy(ns, shp);
+	else
+		shm_unlock(shp);
+	up_write(&shm_ids(ns).rwsem);
+}
+
+/*
+ * idr_for_each() callback: destroy segment @p if it is orphaned (its
+ * creator pointer has been cleared) and shm_may_destroy() allows it.
+ * @data is the ipc namespace. Always returns 0 so the walk continues.
+ *
+ * Called with ns->shm_ids(ns).rwsem locked
+ */
+static int shm_try_destroy_orphaned(int id, void *p, void *data)
+{
+	struct kern_ipc_perm *perm = p;
+	struct ipc_namespace *ns = data;
+	struct shmid_kernel *seg;
+
+	seg = container_of(perm, struct shmid_kernel, shm_perm);
+
+	/*
+	 * Only segments whose originating process has already exited and
+	 * which have no users are destroyed.
+	 *
+	 * As shp->* are changed under rwsem, it's safe to skip shp locking
+	 * for these checks.
+	 */
+	if (seg->shm_creator == NULL && shm_may_destroy(ns, seg)) {
+		shm_lock_by_ptr(seg);
+		shm_destroy(ns, seg);
+	}
+
+	return 0;
+}
+
+/*
+ * Run shm_try_destroy_orphaned() over every segment of @ns, holding
+ * shm_ids.rwsem as a writer. The walk is skipped when no id is in use.
+ */
+void shm_destroy_orphaned(struct ipc_namespace *ns)
+{
+	down_write(&shm_ids(ns).rwsem);
+	if (shm_ids(ns).in_use)
+		idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
+	up_write(&shm_ids(ns).rwsem);
+}
+
+/*
+ * Handle the segments created by an exiting task: every segment on the
+ * task's shm_clist loses its creator pointer, and with
+ * kernel.shm_rmid_forced set those that may be destroyed are destroyed.
+ *
+ * Locking assumes this will only be called with task == current
+ */
+void exit_shm(struct task_struct *task)
+{
+	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
+	struct shmid_kernel *shp, *n;
+
+	/* the task created no segments: nothing to do */
+	if (list_empty(&task->sysvshm.shm_clist))
+		return;
+
+	/*
+	 * If kernel.shm_rmid_forced is not set then only keep track of
+	 * which shmids are orphaned, so that a later set of the sysctl
+	 * can clean them up.
+	 */
+	if (!ns->shm_rmid_forced) {
+		down_read(&shm_ids(ns).rwsem);
+		list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
+			shp->shm_creator = NULL;
+		/*
+		 * Only under read lock but we are only called on current
+		 * so no entry on the list will be shared.
+		 */
+		list_del(&task->sysvshm.shm_clist);
+		up_read(&shm_ids(ns).rwsem);
+		return;
+	}
+
+	/*
+	 * Destroy all already created segments, that were not yet mapped,
+	 * and mark any mapped as orphan to cover the sysctl toggling.
+	 * Destroy is skipped if shm_may_destroy() returns false.
+	 */
+	down_write(&shm_ids(ns).rwsem);
+	/* _safe walk: shm_destroy() unlinks shp from this list */
+	list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
+		shp->shm_creator = NULL;
+
+		if (shm_may_destroy(ns, shp)) {
+			shm_lock_by_ptr(shp);
+			shm_destroy(ns, shp);
+		}
+	}
+
+	/* Remove the list head from any segments still attached. */
+	list_del(&task->sysvshm.shm_clist);
+	up_write(&shm_ids(ns).rwsem);
+}
+
+/*
+ * Page fault handler of a shm mapping: forwards to the fault handler
+ * of the underlying mapping that shm_mmap() saved in sfd->vm_ops.
+ */
+static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct shm_file_data *sfd = shm_file_data(file);
+
+	return sfd->vm_ops->fault(vma, vmf);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Forward a set_policy request to the underlying mapping's vm_ops.
+ * Returns 0 if the underlying mapping has no set_policy operation.
+ */
+static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+{
+	struct shm_file_data *sfd = shm_file_data(vma->vm_file);
+
+	if (!sfd->vm_ops->set_policy)
+		return 0;
+
+	return sfd->vm_ops->set_policy(vma, new);
+}
+
+/*
+ * Return the memory policy for @addr: the underlying mapping's
+ * get_policy operation is asked if there is one, otherwise the vma's
+ * own policy (possibly NULL) is returned.
+ */
+static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
+					unsigned long addr)
+{
+	struct shm_file_data *sfd = shm_file_data(vma->vm_file);
+
+	if (sfd->vm_ops->get_policy)
+		return sfd->vm_ops->get_policy(vma, addr);
+
+	/* NULL when the vma carries no policy of its own */
+	return vma->vm_policy;
+}
+#endif
+
+/*
+ * mmap handler of the wrapper file: lets the underlying file set up
+ * the mapping, then interposes shm_vm_ops and records the attach.
+ * Returns the underlying mmap's result.
+ */
+static int shm_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct shm_file_data *sfd = shm_file_data(file);
+	int ret;
+
+	ret = sfd->file->f_op->mmap(sfd->file, vma);
+	if (ret != 0)
+		return ret;
+	/* remember the underlying vm_ops so the shm_vm_ops can forward */
+	sfd->vm_ops = vma->vm_ops;
+#ifdef CONFIG_MMU
+	/* shm_fault() calls ->fault unconditionally */
+	BUG_ON(!sfd->vm_ops->fault);
+#endif
+	vma->vm_ops = &shm_vm_ops;
+	shm_open(vma);
+
+	return ret;
+}
+
+/*
+ * Release handler of the wrapper file: drops the namespace reference
+ * held by the private data and frees the private data. Returns 0.
+ */
+static int shm_release(struct inode *ino, struct file *file)
+{
+	struct shm_file_data *sfd = shm_file_data(file);
+
+	put_ipc_ns(sfd->ns);
+	/* clear file->private_data before the structure is freed */
+	shm_file_data(file) = NULL;
+	kfree(sfd);
+	return 0;
+}
+
+/*
+ * fsync handler of the wrapper file: forwards to the underlying file's
+ * fsync operation, or returns -EINVAL if it has none.
+ */
+static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct file *real = shm_file_data(file)->file;
+
+	if (real->f_op->fsync)
+		return real->f_op->fsync(real, start, end, datasync);
+
+	return -EINVAL;
+}
+
+/*
+ * fallocate handler of the wrapper file: forwards to the underlying
+ * file's fallocate operation, or returns -EOPNOTSUPP if it has none.
+ */
+static long shm_fallocate(struct file *file, int mode, loff_t offset,
+			  loff_t len)
+{
+	struct shm_file_data *sfd = shm_file_data(file);
+
+	if (!sfd->file->f_op->fallocate)
+		return -EOPNOTSUPP;
+	/*
+	 * NOTE(review): unlike shm_mmap() and shm_fsync(), the wrapper
+	 * @file rather than sfd->file is handed to the underlying
+	 * operation - confirm that this is intended.
+	 */
+	return sfd->file->f_op->fallocate(file, mode, offset, len);
+}
+
+/*
+ * get_unmapped_area handler of the wrapper file: forwards to the
+ * underlying file's operation. The operation is called without a NULL
+ * check, so the underlying file must provide it.
+ */
+static unsigned long shm_get_unmapped_area(struct file *file,
+	unsigned long addr, unsigned long len, unsigned long pgoff,
+	unsigned long flags)
+{
+	struct shm_file_data *sfd = shm_file_data(file);
+	return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
+						pgoff, flags);
+}
+
+/*
+ * File operations of the wrapper file. get_unmapped_area is only
+ * provided here on !CONFIG_MMU builds.
+ */
+static const struct file_operations shm_file_operations = {
+	.mmap		= shm_mmap,
+	.fsync		= shm_fsync,
+	.release	= shm_release,
+#ifndef CONFIG_MMU
+	.get_unmapped_area	= shm_get_unmapped_area,
+#endif
+	.llseek		= noop_llseek,
+	.fallocate	= shm_fallocate,
+};
+
+/*
+ * Variant of shm_file_operations that always provides
+ * get_unmapped_area. is_file_shm_hugepages() identifies files by a
+ * pointer comparison against this table.
+ */
+static const struct file_operations shm_file_operations_huge = {
+	.mmap		= shm_mmap,
+	.fsync		= shm_fsync,
+	.release	= shm_release,
+	.get_unmapped_area	= shm_get_unmapped_area,
+	.llseek		= noop_llseek,
+	.fallocate	= shm_fallocate,
+};
+
+/*
+ * Return non-zero if @file uses the shm_file_operations_huge table.
+ */
+int is_file_shm_hugepages(struct file *file)
+{
+	return file->f_op == &shm_file_operations_huge;
+}
+
+/*
+ * vm operations installed on every shm mapping by shm_mmap(); open and
+ * close maintain the attach count, the others forward to the saved
+ * vm_ops of the underlying mapping.
+ */
+static const struct vm_operations_struct shm_vm_ops = {
+	.open	= shm_open,	/* callback for a new vm-area open */
+	.close	= shm_close,	/* callback for when the vm-area is released */
+	.fault	= shm_fault,
+#if defined(CONFIG_NUMA)
+	.set_policy = shm_set_policy,
+	.get_policy = shm_get_policy,
+#endif
+};
+
+/**
+ * newseg - Create a new shared memory segment
+ * @ns: namespace
+ * @params: ptr to the structure that contains key, size and shmflg
+ *
+ * Called with shm_ids.rwsem held as a writer.
+ *
+ * Return: the id of the new segment, or a negative errno: -EINVAL for a
+ * size outside [SHMMIN, shm_ctlmax] or an unknown huge page size,
+ * -ENOSPC if the namespace-wide page limit would be exceeded, -ENOMEM
+ * if the descriptor cannot be allocated, or the error reported by the
+ * security hook, the backing file setup or ipc_addid().
+ */
+static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
+{
+	key_t key = params->key;
+	int shmflg = params->flg;
+	size_t size = params->u.size;
+	int error;
+	struct shmid_kernel *shp;
+	size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	struct file *file;
+	char name[13];
+	int id;
+	vm_flags_t acctflag = 0;
+
+	if (size < SHMMIN || size > ns->shm_ctlmax)
+		return -EINVAL;
+
+	/* rounding size up to whole pages overflowed */
+	if (numpages << PAGE_SHIFT < size)
+		return -ENOSPC;
+
+	/* the first test catches wrap-around of the page total */
+	if (ns->shm_tot + numpages < ns->shm_tot ||
+			ns->shm_tot + numpages > ns->shm_ctlall)
+		return -ENOSPC;
+
+	shp = ipc_rcu_alloc(sizeof(*shp));
+	if (!shp)
+		return -ENOMEM;
+
+	shp->shm_perm.key = key;
+	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
+	shp->mlock_user = NULL;
+
+	shp->shm_perm.security = NULL;
+	error = security_shm_alloc(shp);
+	if (error) {
+		/* no security data attached yet, so plain ipc_rcu_free */
+		ipc_rcu_putref(shp, ipc_rcu_free);
+		return error;
+	}
+
+	/* "SYSV" + 8 hex digits + NUL fills name[13] exactly */
+	sprintf(name, "SYSV%08x", key);
+	if (shmflg & SHM_HUGETLB) {
+		struct hstate *hs;
+		size_t hugesize;
+
+		hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
+		if (!hs) {
+			error = -EINVAL;
+			goto no_file;
+		}
+		hugesize = ALIGN(size, huge_page_size(hs));
+
+		/* hugetlb_file_setup applies strict accounting */
+		if (shmflg & SHM_NORESERVE)
+			acctflag = VM_NORESERVE;
+		file = hugetlb_file_setup(name, hugesize, acctflag,
+				  &shp->mlock_user, HUGETLB_SHMFS_INODE,
+				(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
+	} else {
+		/*
+		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
+		 * if it's asked for.
+		 */
+		if  ((shmflg & SHM_NORESERVE) &&
+				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+			acctflag = VM_NORESERVE;
+		file = shmem_file_setup(name, size, acctflag, 0);
+	}
+	error = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto no_file;
+
+	/*
+	 * Fill in the descriptor completely before ipc_addid() enters it
+	 * into the id table: once it is in the table it can be looked up
+	 * by id, so it must not be found half-initialised.
+	 */
+	shp->shm_cprid = task_tgid_vnr(current);
+	shp->shm_lprid = 0;
+	shp->shm_atim = shp->shm_dtim = 0;
+	shp->shm_ctim = get_seconds();
+	shp->shm_segsz = size;
+	shp->shm_nattch = 0;
+	shp->shm_file = file;
+	shp->shm_creator = current;
+
+	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
+	if (id < 0) {
+		error = id;
+		goto no_id;
+	}
+
+	/* only a successfully added segment goes on the creator's list */
+	list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
+
+	/*
+	 * shmid gets reported as "inode#" in /proc/pid/maps.
+	 * proc-ps tools use this. Changing this will break them.
+	 */
+	file_inode(file)->i_ino = shp->shm_perm.id;
+
+	ns->shm_tot += numpages;
+	error = shp->shm_perm.id;
+
+	/* release the object lock and rcu read lock on the new object */
+	ipc_unlock_object(&shp->shm_perm);
+	rcu_read_unlock();
+	return error;
+
+no_id:
+	if (is_file_hugepages(file) && shp->mlock_user)
+		user_shm_unlock(size, shp->mlock_user);
+	fput(file);
+no_file:
+	/* security data was allocated above, hence shm_rcu_free */
+	ipc_rcu_putref(shp, shm_rcu_free);
+	return error;
+}
+
+/*
+ * Security check for attaching to an existing segment: forwards to
+ * security_shm_associate() and returns its result. Installed as the
+ * ->associate operation in shmget().
+ *
+ * Called with shm_ids.rwsem and ipcp locked.
+ */
+static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg)
+{
+	struct shmid_kernel *shp;
+
+	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+	return security_shm_associate(shp, shmflg);
+}
+
+/*
+ * Extra check when shmget() finds an existing segment: a request for
+ * more bytes than the segment holds is rejected with -EINVAL;
+ * otherwise 0 is returned.
+ *
+ * Called with shm_ids.rwsem and ipcp locked.
+ */
+static inline int shm_more_checks(struct kern_ipc_perm *ipcp,
+				struct ipc_params *params)
+{
+	struct shmid_kernel *seg =
+		container_of(ipcp, struct shmid_kernel, shm_perm);
+
+	return (seg->shm_segsz < params->u.size) ? -EINVAL : 0;
+}
+
+/*
+ * shmget() system call: packs the arguments into an ipc_params and
+ * hands them, together with the shm-specific callbacks, to ipcget().
+ */
+SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
+{
+	static const struct ipc_ops shm_ops = {
+		.getnew = newseg,
+		.associate = shm_security,
+		.more_checks = shm_more_checks,
+	};
+	struct ipc_params shm_params = {
+		.key = key,
+		.flg = shmflg,
+		.u.size = size,
+	};
+	struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+
+	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
+}
+
+/*
+ * Copy segment information @in to user space in the layout selected by
+ * @version: IPC_64 copies the structure as is, IPC_OLD converts it to
+ * a struct shmid_ds first. Returns what copy_to_user() returns, or
+ * -EINVAL (as unsigned long) for an unknown version.
+ */
+static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version)
+{
+	struct shmid_ds old;
+
+	if (version == IPC_64)
+		return copy_to_user(buf, in, sizeof(*in));
+
+	if (version != IPC_OLD)
+		return -EINVAL;
+
+	/* zero the whole structure before filling in the known fields */
+	memset(&old, 0, sizeof(old));
+	ipc64_perm_to_ipc_perm(&in->shm_perm, &old.shm_perm);
+	old.shm_segsz	= in->shm_segsz;
+	old.shm_atime	= in->shm_atime;
+	old.shm_dtime	= in->shm_dtime;
+	old.shm_ctime	= in->shm_ctime;
+	old.shm_cpid	= in->shm_cpid;
+	old.shm_lpid	= in->shm_lpid;
+	old.shm_nattch	= in->shm_nattch;
+
+	return copy_to_user(buf, &old, sizeof(old));
+}
+
+/*
+ * Read a user supplied shmid descriptor into @out.  For IPC_64 the whole
+ * structure is copied; for IPC_OLD only uid, gid and mode of the
+ * permission block are taken over, the rest of @out is left untouched.
+ * Returns 0 on success, -EFAULT or -EINVAL (as unsigned long) otherwise.
+ */
+static inline unsigned long
+copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
+{
+	struct shmid_ds legacy;
+
+	if (version == IPC_64)
+		return copy_from_user(out, buf, sizeof(*out)) ? -EFAULT : 0;
+
+	if (version != IPC_OLD)
+		return -EINVAL;
+
+	if (copy_from_user(&legacy, buf, sizeof(legacy)))
+		return -EFAULT;
+
+	out->shm_perm.uid	= legacy.shm_perm.uid;
+	out->shm_perm.gid	= legacy.shm_perm.gid;
+	out->shm_perm.mode	= legacy.shm_perm.mode;
+
+	return 0;
+}
+
+/*
+ * Copy the shm limits to user space in the layout selected by @version.
+ * The IPC_OLD layout stores shmmax in an int, so larger values are
+ * clamped to INT_MAX.  Non-zero result means failure.
+ */
+static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version)
+{
+	struct shminfo legacy;
+
+	if (version == IPC_64)
+		return copy_to_user(buf, in, sizeof(*in));
+
+	if (version != IPC_OLD)
+		return -EINVAL;
+
+	legacy.shmmax = (in->shmmax > INT_MAX) ? INT_MAX : (int)in->shmmax;
+	legacy.shmmin = in->shmmin;
+	legacy.shmmni = in->shmmni;
+	legacy.shmseg = in->shmseg;
+	legacy.shmall = in->shmall;
+
+	return copy_to_user(buf, &legacy, sizeof(legacy));
+}
+
+/*
+ * Add the resident and swapped-out page counts of one segment to the
+ * running totals in @rss_add and @swp_add.  Hugetlb backed segments only
+ * contribute to @rss_add, scaled by the huge page size in base pages.
+ * Called with shm_ids.rwsem held as a reader
+ */
+static void shm_add_rss_swap(struct shmid_kernel *shp,
+	unsigned long *rss_add, unsigned long *swp_add)
+{
+	struct inode *inode = file_inode(shp->shm_file);
+
+	if (is_file_hugepages(shp->shm_file)) {
+		struct hstate *h = hstate_file(shp->shm_file);
+		struct address_space *mapping = inode->i_mapping;
+
+		*rss_add += pages_per_huge_page(h) * mapping->nrpages;
+		return;
+	}
+
+#ifdef CONFIG_SHMEM
+	{
+		struct shmem_inode_info *info = SHMEM_I(inode);
+
+		/* both counters are sampled under the shmem inode lock */
+		spin_lock(&info->lock);
+		*rss_add += inode->i_mapping->nrpages;
+		*swp_add += info->swapped;
+		spin_unlock(&info->lock);
+	}
+#else
+	*rss_add += inode->i_mapping->nrpages;
+#endif
+}
+
+/*
+ * Sum up resident and swapped pages over every segment of @ns.  The idr
+ * is probed index by index until as many segments as were in use at
+ * entry have been accounted.
+ * Called with shm_ids.rwsem held as a reader
+ */
+static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
+		unsigned long *swp)
+{
+	int remaining;
+	int idx;
+
+	*rss = 0;
+	*swp = 0;
+
+	remaining = shm_ids(ns).in_use;
+
+	for (idx = 0; remaining > 0; idx++) {
+		struct kern_ipc_perm *perm;
+
+		perm = idr_find(&shm_ids(ns).ipcs_idr, idx);
+		if (!perm)
+			continue;
+
+		shm_add_rss_swap(container_of(perm, struct shmid_kernel,
+					      shm_perm),
+				 rss, swp);
+		remaining--;
+	}
+}
+
+/*
+ * This function handles the shmctl commands which require the rwsem
+ * to be held in write mode: IPC_RMID and IPC_SET.  Any other command
+ * that reaches the switch below yields -EINVAL.
+ * NOTE: no locks must be held, the rwsem is taken inside this function.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
+		       struct shmid_ds __user *buf, int version)
+{
+	struct kern_ipc_perm *ipcp;
+	struct shmid64_ds shmid64;
+	struct shmid_kernel *shp;
+	int err;
+
+	/*
+	 * The user buffer is read before any lock is taken.  For IPC_RMID
+	 * shmid64 stays uninitialised; it is only read on the IPC_SET path
+	 * in this function.
+	 */
+	if (cmd == IPC_SET) {
+		if (copy_shmid_from_user(&shmid64, buf, version))
+			return -EFAULT;
+	}
+
+	down_write(&shm_ids(ns).rwsem);
+	rcu_read_lock();
+
+	/* NOTE(review): presumably looks up the id and checks ownership - confirm in util.c */
+	ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
+				      &shmid64.shm_perm, 0);
+	if (IS_ERR(ipcp)) {
+		err = PTR_ERR(ipcp);
+		goto out_unlock1;
+	}
+
+	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+
+	err = security_shm_shmctl(shp, cmd);
+	if (err)
+		goto out_unlock1;
+
+	switch (cmd) {
+	case IPC_RMID:
+		ipc_lock_object(&shp->shm_perm);
+		/* do_shm_rmid unlocks the ipc object and rcu */
+		do_shm_rmid(ns, ipcp);
+		/* err is still 0 from the security check above */
+		goto out_up;
+	case IPC_SET:
+		ipc_lock_object(&shp->shm_perm);
+		err = ipc_update_perm(&shmid64.shm_perm, ipcp);
+		if (err)
+			goto out_unlock0;
+		shp->shm_ctim = get_seconds();
+		break;
+	default:
+		err = -EINVAL;
+		goto out_unlock1;
+	}
+
+	/* out_unlock0: object lock held; out_unlock1: rcu only; out_up: rwsem only */
+out_unlock0:
+	ipc_unlock_object(&shp->shm_perm);
+out_unlock1:
+	rcu_read_unlock();
+out_up:
+	up_write(&shm_ids(ns).rwsem);
+	return err;
+}
+
+/*
+ * Handle the read-only shmctl commands: IPC_INFO, SHM_INFO, SHM_STAT and
+ * IPC_STAT.  None of them takes the per-object lock; the *_INFO commands
+ * take shm_ids.rwsem as a reader, the *_STAT commands run under
+ * rcu_read_lock() only.
+ *
+ * Returns a negative errno on failure.  On success IPC_INFO and SHM_INFO
+ * return the value of ipc_get_maxid() (clamped to 0 if negative),
+ * SHM_STAT returns the id of the segment and IPC_STAT returns 0.
+ */
+static int shmctl_nolock(struct ipc_namespace *ns, int shmid,
+			 int cmd, int version, void __user *buf)
+{
+	int err;
+	struct shmid_kernel *shp;
+
+	/* preliminary security checks for *_INFO */
+	if (cmd == IPC_INFO || cmd == SHM_INFO) {
+		err = security_shm_shmctl(NULL, cmd);
+		if (err)
+			return err;
+	}
+
+	switch (cmd) {
+	case IPC_INFO:
+	{
+		struct shminfo64 shminfo;
+
+		/* report the namespace limits */
+		memset(&shminfo, 0, sizeof(shminfo));
+		shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni;
+		shminfo.shmmax = ns->shm_ctlmax;
+		shminfo.shmall = ns->shm_ctlall;
+
+		shminfo.shmmin = SHMMIN;
+		if (copy_shminfo_to_user(buf, &shminfo, version))
+			return -EFAULT;
+
+		down_read(&shm_ids(ns).rwsem);
+		err = ipc_get_maxid(&shm_ids(ns));
+		up_read(&shm_ids(ns).rwsem);
+
+		/* ipc_get_maxid() yields -1 when no id is in use */
+		if (err < 0)
+			err = 0;
+		goto out;
+	}
+	case SHM_INFO:
+	{
+		struct shm_info shm_info;
+
+		/* usage statistics are gathered under the rwsem, copied out after */
+		memset(&shm_info, 0, sizeof(shm_info));
+		down_read(&shm_ids(ns).rwsem);
+		shm_info.used_ids = shm_ids(ns).in_use;
+		shm_get_stat(ns, &shm_info.shm_rss, &shm_info.shm_swp);
+		shm_info.shm_tot = ns->shm_tot;
+		shm_info.swap_attempts = 0;
+		shm_info.swap_successes = 0;
+		err = ipc_get_maxid(&shm_ids(ns));
+		up_read(&shm_ids(ns).rwsem);
+		if (copy_to_user(buf, &shm_info, sizeof(shm_info))) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		err = err < 0 ? 0 : err;
+		goto out;
+	}
+	case SHM_STAT:
+	case IPC_STAT:
+	{
+		struct shmid64_ds tbuf;
+		int result;
+
+		rcu_read_lock();
+		/*
+		 * The two commands use different lookup helpers and return
+		 * values: SHM_STAT reports the segment's id, IPC_STAT
+		 * reports 0.
+		 */
+		if (cmd == SHM_STAT) {
+			shp = shm_obtain_object(ns, shmid);
+			if (IS_ERR(shp)) {
+				err = PTR_ERR(shp);
+				goto out_unlock;
+			}
+			result = shp->shm_perm.id;
+		} else {
+			shp = shm_obtain_object_check(ns, shmid);
+			if (IS_ERR(shp)) {
+				err = PTR_ERR(shp);
+				goto out_unlock;
+			}
+			result = 0;
+		}
+
+		/* read permission is required to stat a segment */
+		err = -EACCES;
+		if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
+			goto out_unlock;
+
+		err = security_shm_shmctl(shp, cmd);
+		if (err)
+			goto out_unlock;
+
+		/* snapshot into a zeroed kernel buffer, copy out after rcu unlock */
+		memset(&tbuf, 0, sizeof(tbuf));
+		kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
+		tbuf.shm_segsz	= shp->shm_segsz;
+		tbuf.shm_atime	= shp->shm_atim;
+		tbuf.shm_dtime	= shp->shm_dtim;
+		tbuf.shm_ctime	= shp->shm_ctim;
+		tbuf.shm_cpid	= shp->shm_cprid;
+		tbuf.shm_lpid	= shp->shm_lprid;
+		tbuf.shm_nattch	= shp->shm_nattch;
+		rcu_read_unlock();
+
+		if (copy_shmid_to_user(buf, &tbuf, version))
+			err = -EFAULT;
+		else
+			err = result;
+		goto out;
+	}
+	default:
+		return -EINVAL;
+	}
+
+	/* only reached from the *_STAT error paths, with rcu still held */
+out_unlock:
+	rcu_read_unlock();
+out:
+	return err;
+}
+
+/*
+ * shmctl(2) entry point.  Read-only commands are forwarded to
+ * shmctl_nolock(), IPC_RMID/IPC_SET to shmctl_down(); SHM_LOCK and
+ * SHM_UNLOCK are handled inline under rcu and the per-object lock.
+ * Negative @cmd or @shmid and unknown commands yield -EINVAL.
+ */
+SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
+{
+	struct shmid_kernel *shp;
+	int err, version;
+	struct ipc_namespace *ns;
+
+	if (cmd < 0 || shmid < 0)
+		return -EINVAL;
+
+	/* ipc_parse_version() may modify cmd, hence the pointer */
+	version = ipc_parse_version(&cmd);
+	ns = current->nsproxy->ipc_ns;
+
+	switch (cmd) {
+	case IPC_INFO:
+	case SHM_INFO:
+	case SHM_STAT:
+	case IPC_STAT:
+		return shmctl_nolock(ns, shmid, cmd, version, buf);
+	case IPC_RMID:
+	case IPC_SET:
+		return shmctl_down(ns, shmid, cmd, buf, version);
+	case SHM_LOCK:
+	case SHM_UNLOCK:
+	{
+		struct file *shm_file;
+
+		rcu_read_lock();
+		shp = shm_obtain_object_check(ns, shmid);
+		if (IS_ERR(shp)) {
+			err = PTR_ERR(shp);
+			goto out_unlock1;
+		}
+
+		audit_ipc_obj(&(shp->shm_perm));
+		err = security_shm_shmctl(shp, cmd);
+		if (err)
+			goto out_unlock1;
+
+		ipc_lock_object(&shp->shm_perm);
+
+		/* check if shm_destroy() is tearing down shp */
+		if (!ipc_valid_object(&shp->shm_perm)) {
+			err = -EIDRM;
+			goto out_unlock0;
+		}
+
+		/*
+		 * Without CAP_IPC_LOCK the caller must be owner or creator
+		 * of the segment, and SHM_LOCK additionally requires a
+		 * non-zero RLIMIT_MEMLOCK.
+		 */
+		if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
+			kuid_t euid = current_euid();
+			if (!uid_eq(euid, shp->shm_perm.uid) &&
+			    !uid_eq(euid, shp->shm_perm.cuid)) {
+				err = -EPERM;
+				goto out_unlock0;
+			}
+			if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) {
+				err = -EPERM;
+				goto out_unlock0;
+			}
+		}
+
+		/* hugetlb segments are left alone; err is 0 at this point */
+		shm_file = shp->shm_file;
+		if (is_file_hugepages(shm_file))
+			goto out_unlock0;
+
+		if (cmd == SHM_LOCK) {
+			struct user_struct *user = current_user();
+			err = shmem_lock(shm_file, 1, user);
+			/* record the locking user only on the first successful lock */
+			if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
+				shp->shm_perm.mode |= SHM_LOCKED;
+				shp->mlock_user = user;
+			}
+			goto out_unlock0;
+		}
+
+		/* SHM_UNLOCK */
+		if (!(shp->shm_perm.mode & SHM_LOCKED))
+			goto out_unlock0;
+		shmem_lock(shm_file, 0, shp->mlock_user);
+		shp->shm_perm.mode &= ~SHM_LOCKED;
+		shp->mlock_user = NULL;
+		/*
+		 * Take a file reference before the locks are dropped;
+		 * shmem_unlock_mapping() is called without them and the
+		 * reference is released right after.
+		 */
+		get_file(shm_file);
+		ipc_unlock_object(&shp->shm_perm);
+		rcu_read_unlock();
+		shmem_unlock_mapping(shm_file->f_mapping);
+
+		fput(shm_file);
+		return err;
+	}
+	default:
+		return -EINVAL;
+	}
+
+	/* out_unlock0: object lock and rcu held; out_unlock1: rcu only */
+out_unlock0:
+	ipc_unlock_object(&shp->shm_perm);
+out_unlock1:
+	rcu_read_unlock();
+	return err;
+}
+
+/*
+ * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
+ *
+ * @shmid:   id of the segment to attach
+ * @shmaddr: requested address, or NULL to let the kernel choose
+ * @shmflg:  SHM_RND, SHM_REMAP, SHM_RDONLY, SHM_EXEC
+ * @raddr:   kernel pointer that receives the result of the mapping call
+ * @shmlba:  alignment required of @shmaddr (a power of two)
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
+ * "raddr" thing points to kernel space, and there has to be a wrapper around
+ * this.
+ */
+long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
+	      unsigned long shmlba)
+{
+	struct shmid_kernel *shp;
+	unsigned long addr;
+	unsigned long size;
+	struct file *file;
+	int    err;
+	unsigned long flags;
+	unsigned long prot;
+	int acc_mode;
+	struct ipc_namespace *ns;
+	struct shm_file_data *sfd;
+	struct path path;
+	fmode_t f_mode;
+	unsigned long populate = 0;
+
+	err = -EINVAL;
+	if (shmid < 0)
+		goto out;
+	else if ((addr = (ulong)shmaddr)) {
+		if (addr & (shmlba - 1)) {
+			if (shmflg & SHM_RND) {
+				addr &= ~(shmlba - 1);	   /* round down */
+
+				/*
+				 * Ensure that the round-down is non-nil
+				 * when remapping. For addr < shmlba the
+				 * rounding yields 0, and SHM_REMAP would
+				 * then replace whatever is mapped at the
+				 * nil page with MAP_FIXED.
+				 */
+				if (!addr && (shmflg & SHM_REMAP))
+					goto out;
+			} else
+#ifndef __ARCH_FORCE_SHMLBA
+				if (addr & ~PAGE_MASK)
+#endif
+					goto out;
+		}
+		flags = MAP_SHARED | MAP_FIXED;
+	} else {
+		/* SHM_REMAP makes no sense without an explicit address */
+		if ((shmflg & SHM_REMAP))
+			goto out;
+
+		flags = MAP_SHARED;
+	}
+
+	/* derive mmap protection, ipc access mode and file mode from shmflg */
+	if (shmflg & SHM_RDONLY) {
+		prot = PROT_READ;
+		acc_mode = S_IRUGO;
+		f_mode = FMODE_READ;
+	} else {
+		prot = PROT_READ | PROT_WRITE;
+		acc_mode = S_IRUGO | S_IWUGO;
+		f_mode = FMODE_READ | FMODE_WRITE;
+	}
+	if (shmflg & SHM_EXEC) {
+		prot |= PROT_EXEC;
+		acc_mode |= S_IXUGO;
+	}
+
+	/*
+	 * We cannot rely on the fs check since SYSV IPC does have an
+	 * additional creator id...
+	 */
+	ns = current->nsproxy->ipc_ns;
+	rcu_read_lock();
+	shp = shm_obtain_object_check(ns, shmid);
+	if (IS_ERR(shp)) {
+		err = PTR_ERR(shp);
+		goto out_unlock;
+	}
+
+	err = -EACCES;
+	if (ipcperms(ns, &shp->shm_perm, acc_mode))
+		goto out_unlock;
+
+	err = security_shm_shmat(shp, shmaddr, shmflg);
+	if (err)
+		goto out_unlock;
+
+	ipc_lock_object(&shp->shm_perm);
+
+	/* check if shm_destroy() is tearing down shp */
+	if (!ipc_valid_object(&shp->shm_perm)) {
+		ipc_unlock_object(&shp->shm_perm);
+		err = -EIDRM;
+		goto out_unlock;
+	}
+
+	/*
+	 * Take a path reference and bump the attach count while the object
+	 * is locked.  The count is dropped again at out_nattch, where the
+	 * segment is destroyed if shm_may_destroy() says so.
+	 */
+	path = shp->shm_file->f_path;
+	path_get(&path);
+	shp->shm_nattch++;
+	size = i_size_read(d_inode(path.dentry));
+	ipc_unlock_object(&shp->shm_perm);
+	rcu_read_unlock();
+
+	err = -ENOMEM;
+	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
+	if (!sfd) {
+		path_put(&path);
+		goto out_nattch;
+	}
+
+	/* per-attach file on top of the segment's path */
+	file = alloc_file(&path, f_mode,
+			  is_file_hugepages(shp->shm_file) ?
+				&shm_file_operations_huge :
+				&shm_file_operations);
+	err = PTR_ERR(file);
+	if (IS_ERR(file)) {
+		kfree(sfd);
+		path_put(&path);
+		goto out_nattch;
+	}
+
+	file->private_data = sfd;
+	file->f_mapping = shp->shm_file->f_mapping;
+	sfd->id = shp->shm_perm.id;
+	sfd->ns = get_ipc_ns(ns);
+	sfd->file = shp->shm_file;
+	sfd->vm_ops = NULL;
+
+	err = security_mmap_file(file, prot, flags);
+	if (err)
+		goto out_fput;
+
+	down_write(&current->mm->mmap_sem);
+	if (addr && !(shmflg & SHM_REMAP)) {
+		/* without SHM_REMAP the range must not wrap or hit a vma */
+		err = -EINVAL;
+		if (addr + size < addr)
+			goto invalid;
+
+		if (find_vma_intersection(current->mm, addr, addr + size))
+			goto invalid;
+	}
+
+	addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
+	*raddr = addr;
+	err = 0;
+	if (IS_ERR_VALUE(addr))
+		err = (long)addr;
+invalid:
+	up_write(&current->mm->mmap_sem);
+	if (populate)
+		mm_populate(addr, populate);
+
+out_fput:
+	fput(file);
+
+out_nattch:
+	/* drop the attach count taken above; this path also runs on success */
+	down_write(&shm_ids(ns).rwsem);
+	shp = shm_lock(ns, shmid);
+	BUG_ON(IS_ERR(shp));
+	shp->shm_nattch--;
+	if (shm_may_destroy(ns, shp))
+		shm_destroy(ns, shp);
+	else
+		shm_unlock(shp);
+	up_write(&shm_ids(ns).rwsem);
+	return err;
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	return err;
+}
+
+/*
+ * shmat(2) entry point: thin wrapper around do_shmat() that returns the
+ * attach address.  force_successful_syscall_return() is invoked before
+ * the address is returned.
+ */
+SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
+{
+	unsigned long mapped;
+	long rc;
+
+	rc = do_shmat(shmid, shmaddr, shmflg, &mapped, SHMLBA);
+	if (rc != 0)
+		return rc;
+
+	force_successful_syscall_return();
+	return (long)mapped;
+}
+
+/*
+ * detach and kill segment if marked destroyed.
+ * The work is done in shm_close.
+ *
+ * Returns 0 if at least one matching shm vma was unmapped, -EINVAL if
+ * @shmaddr is not page aligned or no matching vma was found.
+ */
+SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long addr = (unsigned long)shmaddr;
+	int retval = -EINVAL;
+#ifdef CONFIG_MMU
+	loff_t size = 0;
+	struct file *file;
+	struct vm_area_struct *next;
+#endif
+
+	if (addr & ~PAGE_MASK)
+		return retval;
+
+	down_write(&mm->mmap_sem);
+
+	/*
+	 * This function tries to be smart and unmap shm segments that
+	 * were modified by partial mlock or munmap calls:
+	 * - It first determines the size of the shm segment that should be
+	 *   unmapped: It searches for a vma that is backed by shm and that
+	 *   started at address shmaddr. It records its size and then unmaps
+	 *   it.
+	 * - Then it unmaps all shm vmas that started at shmaddr and that
+	 *   are within the initially determined size and that are from the
+	 *   same shm segment from which we determined the size.
+	 * Errors from do_munmap are ignored: the function only fails if
+	 * it's called with invalid parameters or if it's called to unmap
+	 * a part of a vma. Both calls in this function are for full vmas,
+	 * the parameters are directly copied from the vma itself and always
+	 * valid - therefore do_munmap cannot fail. (famous last words?)
+	 */
+	/*
+	 * If it had been mremap()'d, the starting address would not
+	 * match the usual checks anyway. So assume all vma's are
+	 * above the starting address given.
+	 */
+	vma = find_vma(mm, addr);
+
+#ifdef CONFIG_MMU
+	while (vma) {
+		/* fetched up front because do_munmap() below removes vma */
+		next = vma->vm_next;
+
+		/*
+		 * Check if the starting address would match, i.e. it's
+		 * a fragment created by mprotect() and/or munmap(), or
+		 * otherwise it starts at this address with no hassles.
+		 */
+		if ((vma->vm_ops == &shm_vm_ops) &&
+			(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {
+
+			/*
+			 * Record the file of the shm segment being
+			 * unmapped.  With mremap(), someone could place
+			 * page from another segment but with equal offsets
+			 * in the range we are unmapping.
+			 */
+			file = vma->vm_file;
+			size = i_size_read(file_inode(vma->vm_file));
+			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+			/*
+			 * We discovered the size of the shm segment, so
+			 * break out of here and fall through to the next
+			 * loop that uses the size information to stop
+			 * searching for matching vma's.
+			 */
+			retval = 0;
+			vma = next;
+			break;
+		}
+		vma = next;
+	}
+
+	/*
+	 * We need look no further than the maximum address a fragment
+	 * could possibly have landed at. Also cast things to loff_t to
+	 * prevent overflows and make comparisons vs. equal-width types.
+	 *
+	 * If the first loop found nothing, vma is NULL here and this loop
+	 * does not run, so "file" is never read uninitialised.
+	 */
+	size = PAGE_ALIGN(size);
+	while (vma && (loff_t)(vma->vm_end - addr) <= size) {
+		next = vma->vm_next;
+
+		/* finding a matching vma now does not alter retval */
+		if ((vma->vm_ops == &shm_vm_ops) &&
+		    ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
+		    (vma->vm_file == file))
+			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+		vma = next;
+	}
+
+#else /* CONFIG_MMU */
+	/* under NOMMU conditions, the exact address to be destroyed must be
+	 * given */
+	if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
+		do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
+		retval = 0;
+	}
+
+#endif
+
+	up_write(&mm->mmap_sem);
+	return retval;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file show routine: print one line describing the segment passed in
+ * @it.  User and group ids are translated into the user namespace of the
+ * seq_file; the last two columns are resident and swapped sizes in bytes.
+ */
+static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
+{
+	struct shmid_kernel *shp = it;
+	struct user_namespace *user_ns = seq_user_ns(s);
+	unsigned long rss_pages = 0;
+	unsigned long swp_pages = 0;
+
+	shm_add_rss_swap(shp, &rss_pages, &swp_pages);
+
+	/* column width of the size fields depends on the width of long */
+#if BITS_PER_LONG <= 32
+#define SIZE_SPEC "%10lu"
+#else
+#define SIZE_SPEC "%21lu"
+#endif
+
+	seq_printf(s,
+		   "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
+		   "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
+		   SIZE_SPEC " " SIZE_SPEC "\n",
+		   shp->shm_perm.key,
+		   shp->shm_perm.id,
+		   shp->shm_perm.mode,
+		   shp->shm_segsz,
+		   shp->shm_cprid,
+		   shp->shm_lprid,
+		   shp->shm_nattch,
+		   from_kuid_munged(user_ns, shp->shm_perm.uid),
+		   from_kgid_munged(user_ns, shp->shm_perm.gid),
+		   from_kuid_munged(user_ns, shp->shm_perm.cuid),
+		   from_kgid_munged(user_ns, shp->shm_perm.cgid),
+		   shp->shm_atim,
+		   shp->shm_dtim,
+		   shp->shm_ctim,
+		   rss_pages * PAGE_SIZE,
+		   swp_pages * PAGE_SIZE);
+
+	return 0;
+}
+#endif
diff --git a/ipc/syscall.c b/ipc/syscall.c
new file mode 100644
index 000000000..52429489c
--- /dev/null
+++ b/ipc/syscall.c
@@ -0,0 +1,99 @@
+/*
+ * sys_ipc() is the old de-multiplexer for the SysV IPC calls.
+ *
+ * This is really horribly ugly, and new architectures should just wire up
+ * the individual syscalls instead.
+ */
+#include <linux/unistd.h>
+
+#ifdef __ARCH_WANT_SYS_IPC
+#include <linux/errno.h>
+#include <linux/ipc.h>
+#include <linux/shm.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+/*
+ * Demultiplex the legacy ipc(2) call.  The low 16 bits of @call select
+ * the operation, the high 16 bits carry a version used by MSGRCV and
+ * SHMAT.  Unknown operations yield -ENOSYS.
+ */
+SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second,
+		unsigned long, third, void __user *, ptr, long, fifth)
+{
+	int version, ret;
+
+	/* hack for backward compatibility */
+	version = call >> 16;
+	call &= 0xffff;
+
+	switch (call) {
+	case SEMOP:
+		return sys_semtimedop(first, (struct sembuf __user *)ptr,
+				      second, NULL);
+	case SEMTIMEDOP:
+		return sys_semtimedop(first, (struct sembuf __user *)ptr,
+				      second,
+				      (const struct timespec __user *)fifth);
+
+	case SEMGET:
+		return sys_semget(first, second, third);
+	case SEMCTL: {
+		unsigned long arg;
+
+		/* the semctl argument is passed indirectly through @ptr */
+		if (ptr == NULL)
+			return -EINVAL;
+		if (get_user(arg, (unsigned long __user *) ptr))
+			return -EFAULT;
+		return sys_semctl(first, second, third, arg);
+	}
+
+	case MSGSND:
+		return sys_msgsnd(first, (struct msgbuf __user *) ptr,
+				  second, third);
+	case MSGRCV: {
+		struct ipc_kludge tmp;
+
+		if (version != 0)
+			return sys_msgrcv(first,
+					   (struct msgbuf __user *) ptr,
+					   second, fifth, third);
+
+		/* version 0: msgp and msgtyp come bundled in an ipc_kludge */
+		if (ptr == NULL)
+			return -EINVAL;
+
+		if (copy_from_user(&tmp,
+				   (struct ipc_kludge __user *) ptr,
+				   sizeof(tmp)))
+			return -EFAULT;
+		return sys_msgrcv(first, tmp.msgp, second,
+				   tmp.msgtyp, third);
+	}
+	case MSGGET:
+		return sys_msgget((key_t) first, second);
+	case MSGCTL:
+		return sys_msgctl(first, second, (struct msqid_ds __user *)ptr);
+
+	case SHMAT: {
+		unsigned long raddr;
+
+		/*
+		 * This was the entry point for kernel-originating calls
+		 * from iBCS2 in 2.2 days.
+		 */
+		if (version == 1)
+			return -EINVAL;
+
+		ret = do_shmat(first, (char __user *)ptr,
+			       second, &raddr, SHMLBA);
+		if (ret)
+			return ret;
+		/* the attach address is stored where @third points */
+		return put_user(raddr, (unsigned long __user *) third);
+	}
+	case SHMDT:
+		return sys_shmdt((char __user *)ptr);
+	case SHMGET:
+		return sys_shmget(first, second, third);
+	case SHMCTL:
+		return sys_shmctl(first, second,
+				   (struct shmid_ds __user *) ptr);
+	default:
+		return -ENOSYS;
+	}
+}
+#endif
diff --git a/ipc/util.c b/ipc/util.c
new file mode 100644
index 000000000..ff3323ef8
--- /dev/null
+++ b/ipc/util.c
@@ -0,0 +1,883 @@
+/*
+ * linux/ipc/util.c
+ * Copyright (C) 1992 Krishna Balasubramanian
+ *
+ * Sep 1997 - Call suser() last after "normal" permission checks so we
+ *            get BSD style process accounting right.
+ *            Occurs in several places in the IPC code.
+ *            Chris Evans, <chris@ferret.lmh.ox.ac.uk>
+ * Nov 1999 - ipc helper functions, unified SMP locking
+ *	      Manfred Spraul <manfred@colorfullife.com>
+ * Oct 2002 - One lock per IPC id. RCU ipc_free for lock-free grow_ary().
+ *            Mingming Cao <cmm@us.ibm.com>
+ * Mar 2006 - support for audit of ipc object properties
+ *            Dustin Kirkland <dustin.kirkland@us.ibm.com>
+ * Jun 2006 - namespaces support
+ *            OpenVZ, SWsoft Inc.
+ *            Pavel Emelianov <xemul@openvz.org>
+ *
+ * General sysv ipc locking scheme:
+ *	rcu_read_lock()
+ *          obtain the ipc object (kern_ipc_perm) by looking up the id in an idr
+ *	    tree.
+ *	    - perform initial checks (capabilities, auditing and permission,
+ *	      etc).
+ *	    - perform read-only operations, such as STAT, INFO commands.
+ *	      acquire the ipc lock (kern_ipc_perm.lock) through
+ *	      ipc_lock_object()
+ *		- perform data updates, such as SET, RMID commands and
+ *		  mechanism-specific operations (semop/semtimedop,
+ *		  msgsnd/msgrcv, shmat/shmdt).
+ *	    drop the ipc lock, through ipc_unlock_object().
+ *	rcu_read_unlock()
+ *
+ *  The ids->rwsem must be taken when:
+ *	- creating, removing and iterating the existing entries in ipc
+ *	  identifier sets.
+ *	- iterating through files under /proc/sysvipc/
+ *
+ *  Note that sems have a special fast path that avoids kern_ipc_perm.lock -
+ *  see sem_lock().
+ */
+
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/init.h>
+#include <linux/msg.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/capability.h>
+#include <linux/highuid.h>
+#include <linux/security.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/audit.h>
+#include <linux/nsproxy.h>
+#include <linux/rwsem.h>
+#include <linux/memory.h>
+#include <linux/ipc_namespace.h>
+
+#include <asm/unistd.h>
+
+#include "util.h"
+
+/*
+ * Description of one sysvipc proc file, filled in by
+ * ipc_init_proc_interface() and attached to the proc entry as its data.
+ */
+struct ipc_proc_iface {
+	const char *path;	/* path in procfs */
+	const char *header;	/* banner printed at the beginning of the file */
+	int ids;		/* ipc id table to iterate */
+	int (*show)(struct seq_file *, void *);	/* show routine */
+};
+
+/**
+ * ipc_init - initialise ipc subsystem
+ *
+ * The various sysv ipc resources (semaphores, messages and shared
+ * memory) are initialised, in that order. Always returns 0.
+ *
+ * NOTE(review): this comment used to describe a memory hotplug notifier
+ * callback that recomputes msgmni. Nothing of the sort is registered in
+ * this function; confirm whether msg_init() still does it before
+ * relying on that behaviour.
+ */
+static int __init ipc_init(void)
+{
+	sem_init();
+	msg_init();
+	shm_init();
+	return 0;
+}
+device_initcall(ipc_init);
+
+/**
+ * ipc_init_ids	- initialise ipc identifiers
+ * @ids: ipc identifier set
+ *
+ * Reset the bookkeeping of @ids: nothing in use, sequence counter at
+ * zero and no explicitly requested next id (-1). Then initialise the
+ * rwsem and the idr.
+ */
+void ipc_init_ids(struct ipc_ids *ids)
+{
+	init_rwsem(&ids->rwsem);
+	idr_init(&ids->ipcs_idr);
+
+	ids->next_id = -1;
+	ids->seq = 0;
+	ids->in_use = 0;
+}
+
+#ifdef CONFIG_PROC_FS
+static const struct file_operations sysvipc_proc_fops;
+/**
+ * ipc_init_proc_interface -  create a proc interface for sysipc types using a seq_file interface.
+ * @path: Path in procfs
+ * @header: Banner to be printed at the beginning of the file.
+ * @ids: ipc id table to iterate.
+ * @show: show routine.
+ *
+ * The parameters are stored in a freshly allocated ipc_proc_iface that
+ * becomes the data of the new, world readable proc entry. Allocation or
+ * registration failures are silent; the descriptor is freed when the
+ * entry cannot be created.
+ */
+void __init ipc_init_proc_interface(const char *path, const char *header,
+		int ids, int (*show)(struct seq_file *, void *))
+{
+	struct ipc_proc_iface *iface = kmalloc(sizeof(*iface), GFP_KERNEL);
+
+	if (iface == NULL)
+		return;
+
+	iface->show	= show;
+	iface->ids	= ids;
+	iface->header	= header;
+	iface->path	= path;
+
+	/* mode S_IRUGO, no parent directory */
+	if (!proc_create_data(path, S_IRUGO, NULL, &sysvipc_proc_fops, iface))
+		kfree(iface);
+}
+#endif
+
+/**
+ * ipc_findkey	- find a key in an ipc identifier set
+ * @ids: ipc identifier set
+ * @key: key to find
+ *
+ * Walks the idr index by index until every in-use entry has been seen.
+ * Returns the matching ipc structure, locked and with rcu_read_lock()
+ * held, or NULL if no entry carries @key.
+ *
+ * Called with ipc_ids.rwsem held.
+ */
+static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
+{
+	int seen = 0;
+	int idx;
+
+	for (idx = 0; seen < ids->in_use; idx++) {
+		struct kern_ipc_perm *candidate;
+
+		candidate = idr_find(&ids->ipcs_idr, idx);
+		if (!candidate)
+			continue;
+
+		if (candidate->key == key) {
+			rcu_read_lock();
+			ipc_lock_object(candidate);
+			return candidate;
+		}
+
+		seen++;
+	}
+
+	return NULL;
+}
+
+/**
+ * ipc_get_maxid - get the last assigned id
+ * @ids: ipc identifier set
+ *
+ * Returns the highest idr index currently in use, or -1 when the set is
+ * empty. A completely full set short-cuts to IPCMNI - 1.
+ *
+ * Called with ipc_ids.rwsem held.
+ */
+int ipc_get_maxid(struct ipc_ids *ids)
+{
+	int highest = -1;
+	int found = 0;
+	int idx;
+
+	if (ids->in_use == 0)
+		return -1;
+
+	if (ids->in_use == IPCMNI)
+		return IPCMNI - 1;
+
+	/* Look for the last assigned id */
+	for (idx = 0; idx < IPCMNI && found < ids->in_use; idx++) {
+		if (idr_find(&ids->ipcs_idr, idx) == NULL)
+			continue;
+
+		highest = idx;
+		found++;
+	}
+	return highest;
+}
+
+/**
+ * ipc_addid - add an ipc identifier
+ * @ids: ipc identifier set
+ * @new: new ipc permission set
+ * @size: limit for the number of used ids
+ *
+ * Add an entry 'new' to the ipc ids idr. The permissions object is
+ * initialised and the first free entry is set up and the id assigned
+ * is returned. The 'new' entry is returned in a locked state on success.
+ * On failure the entry is not locked and a negative err-code is returned.
+ *
+ * @size is capped at IPCMNI; -ENOSPC is returned when that many ids are
+ * already in use.
+ *
+ * Called with writer ipc_ids.rwsem held.
+ */
+int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
+{
+	kuid_t euid;
+	kgid_t egid;
+	int id;
+	int next_id = ids->next_id;
+
+	if (size > IPCMNI)
+		size = IPCMNI;
+
+	if (ids->in_use >= size)
+		return -ENOSPC;
+
+	idr_preload(GFP_KERNEL);
+
+	spin_lock_init(&new->lock);
+	new->deleted = false;
+	rcu_read_lock();
+	spin_lock(&new->lock);
+
+	/*
+	 * Fill in the ownership before the object is inserted into the
+	 * idr. From that point on it can be found by lookups that run
+	 * under rcu_read_lock() only and evaluate uid/gid without taking
+	 * new->lock, so these fields must not be set afterwards.
+	 */
+	current_euid_egid(&euid, &egid);
+	new->cuid = new->uid = euid;
+	new->gid = new->cgid = egid;
+
+	id = idr_alloc(&ids->ipcs_idr, new,
+		       (next_id < 0) ? 0 : ipcid_to_idx(next_id), 0,
+		       GFP_NOWAIT);
+	idr_preload_end();
+	if (id < 0) {
+		spin_unlock(&new->lock);
+		rcu_read_unlock();
+		return id;
+	}
+
+	ids->in_use++;
+
+	/*
+	 * Without an explicitly requested next_id the sequence number
+	 * comes from the per-set counter, which wraps after
+	 * IPCID_SEQ_MAX. Otherwise it is taken from next_id, which is
+	 * then reset.
+	 */
+	if (next_id < 0) {
+		new->seq = ids->seq++;
+		if (ids->seq > IPCID_SEQ_MAX)
+			ids->seq = 0;
+	} else {
+		new->seq = ipcid_to_seqx(next_id);
+		ids->next_id = -1;
+	}
+
+	new->id = ipc_buildid(id, new->seq);
+	return id;
+}
+
+/**
+ * ipcget_new -	create a new ipc object
+ * @ns: ipc namespace
+ * @ids: ipc identifier set
+ * @ops: the actual creation routine to call
+ * @params: its parameters
+ *
+ * This routine is called by sys_msgget, sys_semget() and sys_shmget()
+ * when the key is IPC_PRIVATE. The creation routine runs with
+ * ids->rwsem held as a writer and its result is returned unchanged.
+ */
+static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids,
+		const struct ipc_ops *ops, struct ipc_params *params)
+{
+	int result;
+
+	down_write(&ids->rwsem);
+	result = ops->getnew(ns, params);
+	up_write(&ids->rwsem);
+
+	return result;
+}
+
+/**
+ * ipc_check_perms - check security and permissions for an ipc object
+ * @ns: ipc namespace
+ * @ipcp: ipc permission set
+ * @ops: the actual security routine to call
+ * @params: its parameters
+ *
+ * This routine is called by sys_msgget(), sys_semget() and sys_shmget()
+ * when the key is not IPC_PRIVATE and that key already exists in the
+ * ds IDR.
+ *
+ * On success, the ipc id is returned. A failed permission check yields
+ * -EACCES; a non-zero result of the ->associate callback is passed on.
+ *
+ * It is called with ipc_ids.rwsem and ipcp->lock held.
+ */
+static int ipc_check_perms(struct ipc_namespace *ns,
+			   struct kern_ipc_perm *ipcp,
+			   const struct ipc_ops *ops,
+			   struct ipc_params *params)
+{
+	int err;
+
+	if (ipcperms(ns, ipcp, params->flg))
+		return -EACCES;
+
+	err = ops->associate(ipcp, params->flg);
+	if (err)
+		return err;
+
+	return ipcp->id;
+}
+
+/**
+ * ipcget_public - get an ipc object or create a new one
+ * @ns: ipc namespace
+ * @ids: ipc identifier set
+ * @ops: the actual creation routine to call
+ * @params: its parameters
+ *
+ * This routine is called by sys_msgget, sys_semget() and sys_shmget()
+ * when the key is not IPC_PRIVATE.
+ * It adds a new entry if the key is not found and IPC_CREAT is set
+ * (-ENOENT otherwise). If the key is found, IPC_CREAT together with
+ * IPC_EXCL yields -EEXIST; otherwise the optional ->more_checks callback
+ * and the permission / security checks are run.
+ *
+ * On success, the ipc id is returned.
+ */
+static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
+		const struct ipc_ops *ops, struct ipc_params *params)
+{
+	struct kern_ipc_perm *ipcp;
+	int flg = params->flg;
+	int err;
+
+	/*
+	 * Take the lock as a writer since we are potentially going to add
+	 * a new entry + read locks are not "upgradable"
+	 */
+	down_write(&ids->rwsem);
+
+	ipcp = ipc_findkey(ids, params->key);
+	if (!ipcp) {
+		/* key not used */
+		err = (flg & IPC_CREAT) ? ops->getnew(ns, params) : -ENOENT;
+		goto out_up;
+	}
+
+	/* ipc object has been locked by ipc_findkey() */
+	if ((flg & IPC_CREAT) && (flg & IPC_EXCL)) {
+		err = -EEXIST;
+	} else {
+		err = ops->more_checks ? ops->more_checks(ipcp, params) : 0;
+		/* ipc_check_perms returns the IPC id on success */
+		if (!err)
+			err = ipc_check_perms(ns, ipcp, ops, params);
+	}
+	ipc_unlock(ipcp);
+
+out_up:
+	up_write(&ids->rwsem);
+
+	return err;
+}
+
+
+/**
+ * ipc_rmid - remove an ipc identifier
+ * @ids: ipc identifier set
+ * @ipcp: ipc perm structure containing the identifier to remove
+ *
+ * Drops the entry from the idr, decrements the in-use count and marks
+ * the object as deleted.
+ *
+ * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
+ * before this function is called, and remain locked on the exit.
+ */
+void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
+{
+	idr_remove(&ids->ipcs_idr, ipcid_to_idx(ipcp->id));
+	ids->in_use--;
+	ipcp->deleted = true;
+}
+
+/**
+ * ipc_alloc -	allocate ipc space
+ * @size: size desired
+ *
+ * Requests larger than PAGE_SIZE are served by vmalloc(), everything
+ * else by kmalloc() with GFP_KERNEL.
+ * NULL is returned if the allocation fails
+ */
+void *ipc_alloc(int size)
+{
+	if (size > PAGE_SIZE)
+		return vmalloc(size);
+
+	return kmalloc(size, GFP_KERNEL);
+}
+
+/**
+ * ipc_free - free ipc space
+ * @ptr: pointer returned by ipc_alloc
+ * @size: size of block
+ *
+ * Free a block created with ipc_alloc(). The caller must know the size
+ * used in the allocation call, because the size decides whether the
+ * block is released with vfree() or kfree().
+ */
+void ipc_free(void *ptr, int size)
+{
+	if (size > PAGE_SIZE) {
+		vfree(ptr);
+		return;
+	}
+
+	kfree(ptr);
+}
+
+/**
+ * ipc_rcu_alloc - allocate ipc and rcu space
+ * @size: size desired
+ *
+ * Allocate memory for the rcu header structure +  the object. The
+ * reference count in the header starts at 1.
+ * Returns the pointer to the object or NULL upon failure.
+ */
+void *ipc_rcu_alloc(int size)
+{
+	struct ipc_rcu *hdr;
+
+	/* the rcu header sits directly in front of the object */
+	hdr = ipc_alloc(sizeof(struct ipc_rcu) + size);
+	if (unlikely(hdr == NULL))
+		return NULL;
+
+	atomic_set(&hdr->refcount, 1);
+	return &hdr[1];
+}
+
+/*
+ * Take a reference on an object obtained from ipc_rcu_alloc(), unless
+ * its count has already dropped to zero. Returns the result of
+ * atomic_inc_not_zero() on the header that precedes @ptr.
+ */
+int ipc_rcu_getref(void *ptr)
+{
+	struct ipc_rcu *hdr = (struct ipc_rcu *)ptr - 1;
+
+	return atomic_inc_not_zero(&hdr->refcount);
+}
+
+/*
+ * Drop a reference on an object obtained from ipc_rcu_alloc(). When the
+ * count reaches zero, @func is queued through call_rcu() on the header
+ * that precedes @ptr.
+ */
+void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head))
+{
+	struct ipc_rcu *hdr = (struct ipc_rcu *)ptr - 1;
+
+	if (atomic_dec_and_test(&hdr->refcount))
+		call_rcu(&hdr->rcu, func);
+}
+
+/*
+ * rcu callback that releases an ipc_rcu header together with the object
+ * behind it, using vfree() or kfree() depending on where the memory
+ * lives.
+ */
+void ipc_rcu_free(struct rcu_head *head)
+{
+	struct ipc_rcu *hdr = container_of(head, struct ipc_rcu, rcu);
+
+	if (!is_vmalloc_addr(hdr)) {
+		kfree(hdr);
+		return;
+	}
+
+	vfree(hdr);
+}
+
+/**
+ * ipcperms - check ipc permissions
+ * @ns: ipc namespace
+ * @ipcp: ipc permission set
+ * @flag: desired permission set
+ *
+ * Check user, group, other permissions for access to ipc resources.
+ * Returns -1 when a requested bit is not granted and the caller lacks
+ * CAP_IPC_OWNER in the namespace's user namespace; otherwise returns the
+ * result of security_ipc_permission() (0 if allowed).
+ *
+ * @flag will most probably be 0 or S_...UGO from <linux/stat.h>
+ */
+int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag)
+{
+	kuid_t euid = current_euid();
+	int wanted, allowed, missing;
+
+	audit_ipc_obj(ipcp);
+
+	/* Fold the user/group/other triplets of @flag into the low bits. */
+	wanted = (flag >> 6) | (flag >> 3) | flag;
+
+	/* Select the triplet of the object's mode that applies to us. */
+	allowed = ipcp->mode;
+	if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid))
+		allowed >>= 6;
+	else if (in_group_p(ipcp->cgid) || in_group_p(ipcp->gid))
+		allowed >>= 3;
+
+	/* Bits requested but not granted; the capability is the fallback. */
+	missing = wanted & ~allowed & 0007;
+	if (missing && !ns_capable(ns->user_ns, CAP_IPC_OWNER))
+		return -1;
+
+	return security_ipc_permission(ipcp, flag);
+}
+
+/*
+ * Functions to convert between the kern_ipc_perm structure and the
+ * old/new ipc_perm structures
+ */
+
+/**
+ * kernel_to_ipc64_perm	- convert kernel ipc permissions to user
+ * @in: kernel permissions
+ * @out: new style ipc permissions
+ *
+ * Fill @out, the description returned to userspace, from the kernel
+ * object @in. Owner and creator ids are translated into the caller's
+ * user namespace; key, mode and seq are copied as they are.
+ */
+void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out)
+{
+	struct user_namespace *user_ns = current_user_ns();
+
+	out->key	= in->key;
+	out->mode	= in->mode;
+	out->seq	= in->seq;
+
+	out->uid	= from_kuid_munged(user_ns, in->uid);
+	out->gid	= from_kgid_munged(user_ns, in->gid);
+	out->cuid	= from_kuid_munged(user_ns, in->cuid);
+	out->cgid	= from_kgid_munged(user_ns, in->cgid);
+}
+
+/**
+ * ipc64_perm_to_ipc_perm - convert new ipc permissions to old
+ * @in: new style ipc permissions
+ * @out: old style ipc permissions
+ *
+ * Fill the compatibility object @out from the new style object @in.
+ * The four ids go through SET_UID()/SET_GID(); key, mode and seq are
+ * plain assignments.
+ */
+void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out)
+{
+	out->key	= in->key;
+	out->mode	= in->mode;
+	out->seq	= in->seq;
+
+	SET_UID(out->uid, in->uid);
+	SET_UID(out->cuid, in->cuid);
+	SET_GID(out->gid, in->gid);
+	SET_GID(out->cgid, in->cgid);
+}
+
+/**
+ * ipc_obtain_object
+ * @ids: ipc identifier set
+ * @id: ipc id to look for
+ *
+ * Look up the idr slot that @id maps to and return the ipc object stored
+ * there, or ERR_PTR(-EINVAL) when the slot is empty.
+ *
+ * Call inside the RCU critical section.
+ * The ipc object is *not* locked on exit.
+ */
+struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id)
+{
+	struct kern_ipc_perm *ipcp;
+
+	ipcp = idr_find(&ids->ipcs_idr, ipcid_to_idx(id));
+	if (ipcp)
+		return ipcp;
+
+	return ERR_PTR(-EINVAL);
+}
+
+/**
+ * ipc_lock - lock an ipc structure without rwsem held
+ * @ids: ipc identifier set
+ * @id: ipc id to look for
+ *
+ * Look for an id in the ipc ids idr and lock the associated ipc object.
+ *
+ * On success the object is returned with its spinlock held and with the
+ * RCU read lock still taken. On failure an ERR_PTR(-EINVAL) is returned
+ * and neither lock is held.
+ */
+struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id)
+{
+	struct kern_ipc_perm *ipcp;
+
+	rcu_read_lock();
+
+	ipcp = ipc_obtain_object(ids, id);
+	if (IS_ERR(ipcp)) {
+		rcu_read_unlock();
+		return ipcp;
+	}
+
+	spin_lock(&ipcp->lock);
+
+	/*
+	 * ipc_rmid() may have removed the ID while we were spinning on the
+	 * lock, so re-validate the object before handing it out.
+	 */
+	if (!ipc_valid_object(ipcp)) {
+		spin_unlock(&ipcp->lock);
+		rcu_read_unlock();
+		return ERR_PTR(-EINVAL);
+	}
+
+	return ipcp;
+}
+
+/**
+ * ipc_obtain_object_check
+ * @ids: ipc identifier set
+ * @id: ipc id to look for
+ *
+ * Similar to ipc_obtain_object() but also verifies, through ipc_checkid(),
+ * that the sequence number encoded in @id matches the one stored in the
+ * object. Returns ERR_PTR(-EIDRM) on a mismatch, or the lookup error from
+ * ipc_obtain_object().
+ *
+ * Call inside the RCU critical section.
+ * The ipc object is *not* locked on exit.
+ */
+struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id)
+{
+	struct kern_ipc_perm *ipcp = ipc_obtain_object(ids, id);
+
+	if (!IS_ERR(ipcp) && ipc_checkid(ipcp, id))
+		return ERR_PTR(-EIDRM);
+
+	return ipcp;
+}
+
+/**
+ * ipcget - Common sys_*get() code
+ * @ns: namespace
+ * @ids: ipc identifier set
+ * @ops: operations to be called on ipc object creation, permission checks
+ *       and further checks
+ * @params: the parameters needed by the previous operations.
+ *
+ * Common routine called by sys_msgget(), sys_semget() and sys_shmget().
+ * A key of IPC_PRIVATE goes to ipcget_new(); any other key goes to
+ * ipcget_public(). The helper's return value is passed through.
+ */
+int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
+			const struct ipc_ops *ops, struct ipc_params *params)
+{
+	if (params->key != IPC_PRIVATE)
+		return ipcget_public(ns, ids, ops, params);
+
+	return ipcget_new(ns, ids, ops, params);
+}
+
+/**
+ * ipc_update_perm - update the permissions of an ipc object
+ * @in:  the permission given as input.
+ * @out: the permission of the ipc to set.
+ *
+ * Maps @in's uid/gid from the caller's user namespace and stores them in
+ * @out together with the S_IRWXUGO bits of @in's mode; all other mode bits
+ * of @out are kept. Returns 0, or -EINVAL (leaving @out untouched) when
+ * either id has no valid mapping.
+ */
+int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	kuid_t new_uid = make_kuid(user_ns, in->uid);
+	kgid_t new_gid = make_kgid(user_ns, in->gid);
+
+	if (!(uid_valid(new_uid) && gid_valid(new_gid)))
+		return -EINVAL;
+
+	out->uid = new_uid;
+	out->gid = new_gid;
+	out->mode = (in->mode & S_IRWXUGO) | (out->mode & ~S_IRWXUGO);
+
+	return 0;
+}
+
+/**
+ * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd
+ * @ns:  ipc namespace
+ * @ids:  the table of ids where to look for the ipc
+ * @id:   the id of the ipc to retrieve
+ * @cmd:  the cmd to check
+ * @perm: the permission to set
+ * @extra_perm: one extra permission parameter used by msq
+ *
+ * This function does some common audit and permissions check for some IPC_XXX
+ * cmd and is called from semctl_down, shmctl_down and msgctl_down.
+ * It
+ *  - retrieves the ipc with the given id in the given table.
+ *  - performs some audit and permission check, depending on the given cmd
+ *  - returns a pointer to the ipc object or otherwise, the corresponding error.
+ *
+ * The ipc object itself is not locked here. Call holding both the rwsem
+ * and the rcu read lock.
+ */
+struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
+					struct ipc_ids *ids, int id, int cmd,
+					struct ipc64_perm *perm, int extra_perm)
+{
+	struct kern_ipc_perm *ipcp;
+	kuid_t euid;
+
+	ipcp = ipc_obtain_object_check(ids, id);
+	if (IS_ERR(ipcp))
+		return ipcp;	/* lookup error is handed back unchanged */
+
+	audit_ipc_obj(ipcp);
+	if (cmd == IPC_SET)
+		audit_ipc_set_perm(extra_perm, perm->uid,
+				   perm->gid, perm->mode);
+
+	/* Only the creator, the owner or CAP_SYS_ADMIN may proceed. */
+	euid = current_euid();
+	if (!uid_eq(euid, ipcp->cuid) && !uid_eq(euid, ipcp->uid) &&
+	    !ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	return ipcp; /* successful lookup */
+}
+
+#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
+
+
+/**
+ * ipc_parse_version - ipc call version
+ * @cmd: pointer to command
+ *
+ * Return IPC_64 for new style IPC and IPC_OLD for old style IPC.
+ * When the IPC_64 bit is present in *@cmd it is stripped, leaving just
+ * the command code; otherwise *@cmd is left as it is.
+ */
+int ipc_parse_version(int *cmd)
+{
+	if (!(*cmd & IPC_64))
+		return IPC_OLD;
+
+	*cmd ^= IPC_64;
+	return IPC_64;
+}
+
+#endif /* CONFIG_ARCH_WANT_IPC_PARSE_VERSION */
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Per-open state of a sysvipc proc file; allocated as the seq_file
+ * private data in sysvipc_proc_open().
+ */
+struct ipc_proc_iter {
+	struct ipc_namespace *ns;	/* reference taken at open, put at release */
+	struct ipc_proc_iface *iface;	/* from PDE_DATA(): header, show(), ids index */
+};
+
+/*
+ * This routine locks the ipc structure found at least at position pos.
+ *
+ * @ids:     identifier set to scan
+ * @pos:     first idr index to consider
+ * @new_pos: on success, set to the index following the returned entry;
+ *           not written when NULL is returned
+ *
+ * Returns the first entry at index >= @pos with both the RCU read lock and
+ * the object lock taken, or NULL when there is none.
+ */
+static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
+					      loff_t *new_pos)
+{
+	struct kern_ipc_perm *ipc;
+	int total, id;
+
+	/* Count the entries that live below pos, up to ids->in_use. */
+	total = 0;
+	for (id = 0; id < pos && total < ids->in_use; id++) {
+		ipc = idr_find(&ids->ipcs_idr, id);
+		if (ipc != NULL)
+			total++;
+	}
+
+	/* Every in-use entry is below pos: nothing left to report. */
+	if (total >= ids->in_use)
+		return NULL;
+
+	/* Scan forward from pos for the next populated slot. */
+	for (; pos < IPCMNI; pos++) {
+		ipc = idr_find(&ids->ipcs_idr, pos);
+		if (ipc != NULL) {
+			*new_pos = pos + 1;
+			/* Both locks are dropped later through ipc_unlock(). */
+			rcu_read_lock();
+			ipc_lock_object(ipc);
+			return ipc;
+		}
+	}
+
+	/* Out of range - return NULL to terminate iteration */
+	return NULL;
+}
+
+/*
+ * seq_file ->next(): release the entry handed out previously (if it was a
+ * real ipc object) and return the next one, locked, starting the search
+ * at *pos.
+ */
+static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos)
+{
+	struct ipc_proc_iter *iter = s->private;
+	struct ipc_ids *ids = &iter->ns->ids[iter->iface->ids];
+	struct kern_ipc_perm *prev = it;
+
+	/* If we had an ipc id locked before, unlock it */
+	if (prev != NULL && prev != SEQ_START_TOKEN)
+		ipc_unlock(prev);
+
+	return sysvipc_find_ipc(ids, *pos, pos);
+}
+
+/*
+ * File positions: pos 0 -> header, pos n -> ipc id = n - 1.
+ * seq_file iterator value: a locked ipc pointer or SEQ_START_TOKEN.
+ *
+ * ids->rwsem is taken for reading on every path, including the ones that
+ * return NULL; sysvipc_proc_stop() releases it.
+ */
+static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos)
+{
+	struct ipc_proc_iter *iter = s->private;
+	struct ipc_ids *ids = &iter->ns->ids[iter->iface->ids];
+
+	/*
+	 * Take the lock - this will be released by the corresponding
+	 * call to stop().
+	 */
+	down_read(&ids->rwsem);
+
+	/* pos == 0 means header */
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	/* pos < 0 is invalid */
+	if (*pos < 0)
+		return NULL;
+
+	/* Find the (pos-1)th ipc */
+	return sysvipc_find_ipc(ids, *pos - 1, pos);
+}
+
+/*
+ * seq_file ->stop(): unlock the entry still held by the iterator, if any,
+ * then drop the rwsem acquired in sysvipc_proc_start().
+ */
+static void sysvipc_proc_stop(struct seq_file *s, void *it)
+{
+	struct ipc_proc_iter *iter = s->private;
+	struct ipc_ids *ids = &iter->ns->ids[iter->iface->ids];
+	struct kern_ipc_perm *held = it;
+
+	/* If we had a locked structure, release it */
+	if (held != NULL && held != SEQ_START_TOKEN)
+		ipc_unlock(held);
+
+	/* Release the lock we took in start() */
+	up_read(&ids->rwsem);
+}
+
+/*
+ * seq_file ->show(): emit the interface's header line for the start token,
+ * otherwise delegate to the interface's show() callback.
+ */
+static int sysvipc_proc_show(struct seq_file *s, void *it)
+{
+	struct ipc_proc_iter *iter = s->private;
+	struct ipc_proc_iface *iface = iter->iface;
+
+	if (it != SEQ_START_TOKEN)
+		return iface->show(s, it);
+
+	seq_puts(s, iface->header);
+	return 0;
+}
+
+/* seq_file iterator callbacks; installed by sysvipc_proc_open(). */
+static const struct seq_operations sysvipc_proc_seqops = {
+	.start = sysvipc_proc_start,
+	.stop  = sysvipc_proc_stop,
+	.next  = sysvipc_proc_next,
+	.show  = sysvipc_proc_show,
+};
+
+/*
+ * ->open(): set up the seq_file with a private struct ipc_proc_iter and
+ * record the proc interface and the opener's ipc namespace in it.
+ * Returns 0, or -ENOMEM when the seq_file state cannot be allocated.
+ */
+static int sysvipc_proc_open(struct inode *inode, struct file *file)
+{
+	struct ipc_proc_iter *iter;
+
+	iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter));
+	if (iter == NULL)
+		return -ENOMEM;
+
+	/* The namespace reference is put in sysvipc_proc_release(). */
+	iter->ns    = get_ipc_ns(current->nsproxy->ipc_ns);
+	iter->iface = PDE_DATA(inode);
+
+	return 0;
+}
+
+/*
+ * ->release(): put the namespace reference taken at open time, then let
+ * seq_release_private() tear down the seq_file and its private data.
+ */
+static int sysvipc_proc_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	struct ipc_proc_iter *state = m->private;
+
+	put_ipc_ns(state->ns);
+
+	return seq_release_private(inode, file);
+}
+
+/*
+ * File operations of the sysvipc proc files: open/release are local,
+ * read and llseek are the generic seq_file helpers.
+ */
+static const struct file_operations sysvipc_proc_fops = {
+	.open    = sysvipc_proc_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = sysvipc_proc_release,
+};
+#endif /* CONFIG_PROC_FS */
diff --git a/ipc/util.h b/ipc/util.h
new file mode 100644
index 000000000..1a5a0fcd0
--- /dev/null
+++ b/ipc/util.h
@@ -0,0 +1,207 @@
+/*
+ * linux/ipc/util.h
+ * Copyright (C) 1999 Christoph Rohland
+ *
+ * ipc helper functions (c) 1999 Manfred Spraul <manfred@colorfullife.com>
+ * namespaces support.      2006 OpenVZ, SWsoft Inc.
+ *                               Pavel Emelianov <xemul@openvz.org>
+ */
+
+#ifndef _IPC_UTIL_H
+#define _IPC_UTIL_H
+
+#include <linux/unistd.h>
+#include <linux/err.h>
+
+/* Factor separating the sequence number from the slot index in an ipc id. */
+#define SEQ_MULTIPLIER	(IPCMNI)
+
+void sem_init(void);
+void msg_init(void);
+void shm_init(void);
+
+struct ipc_namespace;
+
+/* POSIX mqueue hooks; empty stubs when CONFIG_POSIX_MQUEUE is off. */
+#ifdef CONFIG_POSIX_MQUEUE
+extern void mq_clear_sbinfo(struct ipc_namespace *ns);
+extern void mq_put_mnt(struct ipc_namespace *ns);
+#else
+static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { }
+static inline void mq_put_mnt(struct ipc_namespace *ns) { }
+#endif
+
+/*
+ * Per-namespace init/exit hooks for semaphores, message queues and shared
+ * memory; empty stubs when CONFIG_SYSVIPC is off.
+ */
+#ifdef CONFIG_SYSVIPC
+void sem_init_ns(struct ipc_namespace *ns);
+void msg_init_ns(struct ipc_namespace *ns);
+void shm_init_ns(struct ipc_namespace *ns);
+
+void sem_exit_ns(struct ipc_namespace *ns);
+void msg_exit_ns(struct ipc_namespace *ns);
+void shm_exit_ns(struct ipc_namespace *ns);
+#else
+static inline void sem_init_ns(struct ipc_namespace *ns) { }
+static inline void msg_init_ns(struct ipc_namespace *ns) { }
+static inline void shm_init_ns(struct ipc_namespace *ns) { }
+
+static inline void sem_exit_ns(struct ipc_namespace *ns) { }
+static inline void msg_exit_ns(struct ipc_namespace *ns) { }
+static inline void shm_exit_ns(struct ipc_namespace *ns) { }
+#endif
+
+/*
+ * Header placed in front of every object handed out by ipc_rcu_alloc():
+ * the rcu_head used for deferred destruction plus a reference count.
+ */
+struct ipc_rcu {
+	struct rcu_head rcu;
+	atomic_t refcount;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Map a header pointer to the object that follows it. The argument is
+ * parenthesised so that a low-precedence expression (for example a
+ * conditional "c ? a : b") is advanced as a whole rather than having
+ * "+1" bind to its last operand only.
+ */
+#define ipc_rcu_to_struct(p)  ((void *)((p) + 1))
+
+/*
+ * Structure that holds the parameters needed by the ipc operations
+ * (struct ipc_ops, declared below)
+ */
+struct ipc_params {
+	key_t key;		/* IPC_PRIVATE or a public key, see ipcget() */
+	int flg;		/* NOTE(review): presumably the caller's *get() flags - confirm */
+	union {
+		size_t size;	/* for shared memories */
+		int nsems;	/* for semaphores */
+	} u;			/* holds the getnew() specific param */
+};
+
+/*
+ * Structure that holds some ipc operations. This structure is used to unify
+ * the calls to sys_msgget(), sys_semget(), sys_shmget()
+ *      . routine to call to create a new ipc object. Can be one of newque,
+ *        newary, newseg
+ *      . routine to call to check permissions for a new ipc object.
+ *        Can be one of security_msg_associate, security_sem_associate,
+ *        security_shm_associate
+ *      . routine to call for an extra check if needed
+ */
+struct ipc_ops {
+	/* create a new ipc object */
+	int (*getnew)(struct ipc_namespace *, struct ipc_params *);
+	/* permission check hook */
+	int (*associate)(struct kern_ipc_perm *, int);
+	/* optional extra check */
+	int (*more_checks)(struct kern_ipc_perm *, struct ipc_params *);
+};
+
+struct seq_file;
+struct ipc_ids;
+
+void ipc_init_ids(struct ipc_ids *);
+/*
+ * Registration of a sysvipc proc file. Without CONFIG_PROC_FS the call
+ * expands to an empty statement.
+ */
+#ifdef CONFIG_PROC_FS
+void __init ipc_init_proc_interface(const char *path, const char *header,
+		int ids, int (*show)(struct seq_file *, void *));
+#else
+#define ipc_init_proc_interface(path, header, ids, show) do {} while (0)
+#endif
+
+/* Indices of the three identifier sets (see the "ids" argument above). */
+#define IPC_SEM_IDS	0
+#define IPC_MSG_IDS	1
+#define IPC_SHM_IDS	2
+
+/*
+ * An ipc id is SEQ_MULTIPLIER * seq + idx (see ipc_buildid()); these
+ * macros split it back into the slot index and the sequence number.
+ */
+#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
+#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)
+/* Upper bound for the sequence number part of an id. */
+#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX)
+
+/* must be called with ids->rwsem acquired for writing */
+int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
+
+/* must be called with ids->rwsem acquired for reading */
+int ipc_get_maxid(struct ipc_ids *);
+
+/* must be called with both locks acquired. */
+void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *);
+
+/* must be called with ipcp locked */
+int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
+
+/* For rare, potentially huge allocations.
+ * Both functions can sleep.
+ */
+void *ipc_alloc(int size);
+void ipc_free(void *ptr, int size);
+
+/*
+ * For allocations that need to be freed by RCU.
+ * Objects are reference counted, they start with reference count 1.
+ * getref increases the refcount, the putref call that reduces the refcount
+ * to 0 schedules the rcu destruction. Caller must guarantee locking.
+ */
+void *ipc_rcu_alloc(int size);
+int ipc_rcu_getref(void *ptr);
+void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head));
+void ipc_rcu_free(struct rcu_head *head);
+
+struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
+struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id);
+
+void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
+void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
+int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out);
+struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns,
+					     struct ipc_ids *ids, int id, int cmd,
+					     struct ipc64_perm *perm, int extra_perm);
+
+/*
+ * Without CONFIG_ARCH_WANT_IPC_PARSE_VERSION the version is always IPC_64
+ * and the macro leaves its argument untouched.
+ */
+#ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
+/* On IA-64, we always use the "64-bit version" of the IPC structures.  */
+# define ipc_parse_version(cmd)	IPC_64
+#else
+int ipc_parse_version(int *cmd);
+#endif
+
+extern void free_msg(struct msg_msg *msg);
+extern struct msg_msg *load_msg(const void __user *src, size_t len);
+extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst);
+extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len);
+
+extern void recompute_msgmni(struct ipc_namespace *);
+
+/*
+ * Compose an ipc id from slot index @id and sequence number @seq;
+ * ipcid_to_idx() and ipcid_to_seqx() undo this.
+ */
+static inline int ipc_buildid(int id, int seq)
+{
+	int seq_part = SEQ_MULTIPLIER * seq;
+
+	return seq_part + id;
+}
+
+/*
+ * Compare the sequence number encoded in the user-supplied id @uid with
+ * the one stored in @ipcp. Returns nonzero on a mismatch, 0 when they
+ * agree.
+ */
+static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid)
+{
+	return ipcid_to_seqx(uid) != ipcp->seq;
+}
+
+/* Take the per-object spinlock of @perm. */
+static inline void ipc_lock_object(struct kern_ipc_perm *perm)
+{
+	spin_lock(&perm->lock);
+}
+
+/* Release the per-object spinlock of @perm. */
+static inline void ipc_unlock_object(struct kern_ipc_perm *perm)
+{
+	spin_unlock(&perm->lock);
+}
+
+/* Assert, via assert_spin_locked(), that @perm's spinlock is held. */
+static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm)
+{
+	assert_spin_locked(&perm->lock);
+}
+
+/*
+ * Release the per-object spinlock of @perm and then leave the RCU
+ * read-side critical section.
+ */
+static inline void ipc_unlock(struct kern_ipc_perm *perm)
+{
+	spin_unlock(&perm->lock);
+	rcu_read_unlock();
+}
+
+/*
+ * ipc_valid_object() - helper to sort out IPC_RMID races for codepaths
+ * where the respective ipc_ids.rwsem is not being held down.
+ * Reports whether the ipc object is still around (true) or has already
+ * been marked deleted (false), as ipc_rmid() may have removed the ID while
+ * the ipc lock was spinning.
+ * Needs to be called with kern_ipc_perm.lock held -- exception made for one
+ * checkpoint case at sys_semtimedop() as noted in code commentary.
+ */
+static inline bool ipc_valid_object(struct kern_ipc_perm *perm)
+{
+	if (perm->deleted)
+		return false;
+
+	return true;
+}
+
+struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id);
+int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
+			const struct ipc_ops *ops, struct ipc_params *params);
+void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
+		void (*free)(struct ipc_namespace *, struct kern_ipc_perm *));
+#endif
author	André Fabian Silva Delgado <emulatorman@parabola.nu>	2015-08-05 17:04:01 -0300
committer	André Fabian Silva Delgado <emulatorman@parabola.nu>	2015-08-05 17:04:01 -0300
commit	57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree	5e910f0e82173f4ef4f51111366a3f1299037a7b /ipc