From 8d91c1e411f55d7ea91b1183a2e9f8088fb4d5be Mon Sep 17 00:00:00 2001
From: André Fabian Silva Delgado <emulatorman@parabola.nu>
Date: Tue, 15 Dec 2015 14:52:16 -0300
Subject: Linux-libre 4.3.2-gnu

---
 drivers/infiniband/core/Makefile        |   3 +-
 drivers/infiniband/core/cache.c         | 773 +++++++++++++++++++++++++++-----
 drivers/infiniband/core/cm.c            | 225 ++++++----
 drivers/infiniband/core/cma.c           | 683 +++++++++++++++++++++-------
 drivers/infiniband/core/core_priv.h     |  54 ++-
 drivers/infiniband/core/device.c        | 335 +++++++++++---
 drivers/infiniband/core/mad.c           |  28 +-
 drivers/infiniband/core/mad_priv.h      |   1 -
 drivers/infiniband/core/multicast.c     |   7 +-
 drivers/infiniband/core/netlink.c       |  55 +++
 drivers/infiniband/core/roce_gid_mgmt.c | 747 ++++++++++++++++++++++++++++++
 drivers/infiniband/core/sa_query.c      | 515 ++++++++++++++++++++-
 drivers/infiniband/core/sysfs.c         |  51 +--
 drivers/infiniband/core/ucm.c           |   9 +-
 drivers/infiniband/core/ucma.c          | 151 ++++++-
 drivers/infiniband/core/user_mad.c      |   6 +-
 drivers/infiniband/core/uverbs.h        |  13 +-
 drivers/infiniband/core/uverbs_cmd.c    | 137 ++++--
 drivers/infiniband/core/uverbs_main.c   | 405 ++++++++++++++---
 drivers/infiniband/core/verbs.c         | 198 ++++----
 20 files changed, 3636 insertions(+), 760 deletions(-)
 create mode 100644 drivers/infiniband/core/roce_gid_mgmt.c

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index acf736764..d43a8994a 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -9,7 +9,8 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
 					$(user_access-y)
 
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
-				device.o fmr_pool.o cache.o netlink.o
+				device.o fmr_pool.o cache.o netlink.o \
+				roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 871da832d..87471ef37 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -37,6 +37,8 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
 
 #include <rdma/ib_cache.h>
 
@@ -47,76 +49,621 @@ struct ib_pkey_cache {
 	u16             table[0];
 };
 
-struct ib_gid_cache {
-	int             table_len;
-	union ib_gid    table[0];
-};
-
 struct ib_update_work {
 	struct work_struct work;
 	struct ib_device  *device;
 	u8                 port_num;
 };
 
-int ib_get_cached_gid(struct ib_device *device,
-		      u8                port_num,
-		      int               index,
-		      union ib_gid     *gid)
+union ib_gid zgid;
+EXPORT_SYMBOL(zgid);
+
+static const struct ib_gid_attr zattr;
+
+enum gid_attr_find_mask {
+	GID_ATTR_FIND_MASK_GID          = 1UL << 0,
+	GID_ATTR_FIND_MASK_NETDEV	= 1UL << 1,
+	GID_ATTR_FIND_MASK_DEFAULT	= 1UL << 2,
+};
+
+enum gid_table_entry_props {
+	GID_TABLE_ENTRY_INVALID		= 1UL << 0,
+	GID_TABLE_ENTRY_DEFAULT		= 1UL << 1,
+};
+
+enum gid_table_write_action {
+	GID_TABLE_WRITE_ACTION_ADD,
+	GID_TABLE_WRITE_ACTION_DEL,
+	/* MODIFY only updates the GID table. Currently only used by
+	 * ib_cache_update.
+	 */
+	GID_TABLE_WRITE_ACTION_MODIFY
+};
+
+struct ib_gid_table_entry {
+	/* This lock protects an entry from being
+	 * read and written simultaneously.
+	 */
+	rwlock_t	    lock;
+	unsigned long	    props;
+	union ib_gid        gid;
+	struct ib_gid_attr  attr;
+	void		   *context;
+};
+
+struct ib_gid_table {
+	int                  sz;
+	/* In RoCE, adding a GID to the table requires:
+	 * (a) Find if this GID already exists.
+	 * (b) Find a free slot.
+	 * (c) Write the new GID.
+	 *
+	 * Delete requires a different set of operations:
+	 * (a) Find the GID
+	 * (b) Delete it.
+	 *
+	 * Add/delete should be carried out atomically.
+	 * This is enforced by having all writers take this
+	 * mutex. We don't need this lock for IB, as the MAD
+	 * layer replaces all entries. All data_vec entries
+	 * are protected by this lock.
+	 */
+	struct mutex         lock;
+	struct ib_gid_table_entry *data_vec;
+};
+
+static int write_gid(struct ib_device *ib_dev, u8 port,
+		     struct ib_gid_table *table, int ix,
+		     const union ib_gid *gid,
+		     const struct ib_gid_attr *attr,
+		     enum gid_table_write_action action,
+		     bool  default_gid)
 {
-	struct ib_gid_cache *cache;
+	int ret = 0;
+	struct net_device *old_net_dev;
 	unsigned long flags;
+
+	/* When rdma_cap_roce_gid_table() holds, this function should be
+	 * protected by a sleepable lock.
+	 */
+	write_lock_irqsave(&table->data_vec[ix].lock, flags);
+
+	if (rdma_cap_roce_gid_table(ib_dev, port)) {
+		table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
+		write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+		/* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
+		 * RoCE providers and thus only updates the cache.
+		 */
+		if (action == GID_TABLE_WRITE_ACTION_ADD)
+			ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr,
+					      &table->data_vec[ix].context);
+		else if (action == GID_TABLE_WRITE_ACTION_DEL)
+			ret = ib_dev->del_gid(ib_dev, port, ix,
+					      &table->data_vec[ix].context);
+		write_lock_irqsave(&table->data_vec[ix].lock, flags);
+	}
+
+	old_net_dev = table->data_vec[ix].attr.ndev;
+	if (old_net_dev && old_net_dev != attr->ndev)
+		dev_put(old_net_dev);
+	/* if the update failed, or this is a delete, store the zero gid */
+	if (ret || action == GID_TABLE_WRITE_ACTION_DEL) {
+		gid = &zgid;
+		attr = &zattr;
+		table->data_vec[ix].context = NULL;
+	}
+	if (default_gid)
+		table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT;
+	memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid));
+	memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr));
+	if (table->data_vec[ix].attr.ndev &&
+	    table->data_vec[ix].attr.ndev != old_net_dev)
+		dev_hold(table->data_vec[ix].attr.ndev);
+
+	table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
+
+	write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+
+	if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) {
+		struct ib_event event;
+
+		event.device		= ib_dev;
+		event.element.port_num	= port;
+		event.event		= IB_EVENT_GID_CHANGE;
+
+		ib_dispatch_event(&event);
+	}
+	return ret;
+}
+
+static int add_gid(struct ib_device *ib_dev, u8 port,
+		   struct ib_gid_table *table, int ix,
+		   const union ib_gid *gid,
+		   const struct ib_gid_attr *attr,
+		   bool  default_gid) {
+	return write_gid(ib_dev, port, table, ix, gid, attr,
+			 GID_TABLE_WRITE_ACTION_ADD, default_gid);
+}
+
+static int modify_gid(struct ib_device *ib_dev, u8 port,
+		      struct ib_gid_table *table, int ix,
+		      const union ib_gid *gid,
+		      const struct ib_gid_attr *attr,
+		      bool  default_gid) {
+	return write_gid(ib_dev, port, table, ix, gid, attr,
+			 GID_TABLE_WRITE_ACTION_MODIFY, default_gid);
+}
+
+static int del_gid(struct ib_device *ib_dev, u8 port,
+		   struct ib_gid_table *table, int ix,
+		   bool  default_gid) {
+	return write_gid(ib_dev, port, table, ix, &zgid, &zattr,
+			 GID_TABLE_WRITE_ACTION_DEL, default_gid);
+}
+
+static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
+		    const struct ib_gid_attr *val, bool default_gid,
+		    unsigned long mask)
+{
+	int i;
+
+	for (i = 0; i < table->sz; i++) {
+		unsigned long flags;
+		struct ib_gid_attr *attr = &table->data_vec[i].attr;
+
+		read_lock_irqsave(&table->data_vec[i].lock, flags);
+
+		if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
+			goto next;
+
+		if (mask & GID_ATTR_FIND_MASK_GID &&
+		    memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+			goto next;
+
+		if (mask & GID_ATTR_FIND_MASK_NETDEV &&
+		    attr->ndev != val->ndev)
+			goto next;
+
+		if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
+		    !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) !=
+		    default_gid)
+			goto next;
+
+		read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+		return i;
+next:
+		read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+	}
+
+	return -1;
+}
+
+static void make_default_gid(struct  net_device *dev, union ib_gid *gid)
+{
+	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+	addrconf_ifid_eui48(&gid->raw[8], dev);
+}
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+		     union ib_gid *gid, struct ib_gid_attr *attr)
+{
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
+	int ix;
 	int ret = 0;
+	struct net_device *idev;
 
-	if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+	table = ports_table[port - rdma_start_port(ib_dev)];
+
+	if (!memcmp(gid, &zgid, sizeof(*gid)))
 		return -EINVAL;
 
-	read_lock_irqsave(&device->cache.lock, flags);
+	if (ib_dev->get_netdev) {
+		idev = ib_dev->get_netdev(ib_dev, port);
+		if (idev && attr->ndev != idev) {
+			union ib_gid default_gid;
 
-	cache = device->cache.gid_cache[port_num - rdma_start_port(device)];
+			/* Adding default GIDs is not permitted */
+			make_default_gid(idev, &default_gid);
+			if (!memcmp(gid, &default_gid, sizeof(*gid))) {
+				dev_put(idev);
+				return -EPERM;
+			}
+		}
+		if (idev)
+			dev_put(idev);
+	}
 
-	if (index < 0 || index >= cache->table_len)
-		ret = -EINVAL;
-	else
-		*gid = cache->table[index];
+	mutex_lock(&table->lock);
 
-	read_unlock_irqrestore(&device->cache.lock, flags);
+	ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
+		      GID_ATTR_FIND_MASK_NETDEV);
+	if (ix >= 0)
+		goto out_unlock;
 
+	ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID |
+		      GID_ATTR_FIND_MASK_DEFAULT);
+	if (ix < 0) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+
+	add_gid(ib_dev, port, table, ix, gid, attr, false);
+
+out_unlock:
+	mutex_unlock(&table->lock);
 	return ret;
 }
-EXPORT_SYMBOL(ib_get_cached_gid);
 
-int ib_find_cached_gid(struct ib_device   *device,
-		       const union ib_gid *gid,
-		       u8                 *port_num,
-		       u16                *index)
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+		     union ib_gid *gid, struct ib_gid_attr *attr)
 {
-	struct ib_gid_cache *cache;
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
+	int ix;
+
+	table = ports_table[port - rdma_start_port(ib_dev)];
+
+	mutex_lock(&table->lock);
+
+	ix = find_gid(table, gid, attr, false,
+		      GID_ATTR_FIND_MASK_GID	  |
+		      GID_ATTR_FIND_MASK_NETDEV	  |
+		      GID_ATTR_FIND_MASK_DEFAULT);
+	if (ix < 0)
+		goto out_unlock;
+
+	del_gid(ib_dev, port, table, ix, false);
+
+out_unlock:
+	mutex_unlock(&table->lock);
+	return 0;
+}
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+				     struct net_device *ndev)
+{
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
+	int ix;
+
+	table  = ports_table[port - rdma_start_port(ib_dev)];
+
+	mutex_lock(&table->lock);
+
+	for (ix = 0; ix < table->sz; ix++)
+		if (table->data_vec[ix].attr.ndev == ndev)
+			del_gid(ib_dev, port, table, ix, false);
+
+	mutex_unlock(&table->lock);
+	return 0;
+}
+
+static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
+			      union ib_gid *gid, struct ib_gid_attr *attr)
+{
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
 	unsigned long flags;
-	int p, i;
-	int ret = -ENOENT;
 
-	*port_num = -1;
-	if (index)
-		*index = -1;
+	table = ports_table[port - rdma_start_port(ib_dev)];
 
-	read_lock_irqsave(&device->cache.lock, flags);
+	if (index < 0 || index >= table->sz)
+		return -EINVAL;
 
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-		cache = device->cache.gid_cache[p];
-		for (i = 0; i < cache->table_len; ++i) {
-			if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
-				*port_num = p + rdma_start_port(device);
-				if (index)
-					*index = i;
-				ret = 0;
-				goto found;
-			}
+	read_lock_irqsave(&table->data_vec[index].lock, flags);
+	if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) {
+		read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+		return -EAGAIN;
+	}
+
+	memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
+	if (attr) {
+		memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
+		if (attr->ndev)
+			dev_hold(attr->ndev);
+	}
+
+	read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+	return 0;
+}
+
+static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
+				    const union ib_gid *gid,
+				    const struct ib_gid_attr *val,
+				    unsigned long mask,
+				    u8 *port, u16 *index)
+{
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
+	u8 p;
+	int local_index;
+
+	for (p = 0; p < ib_dev->phys_port_cnt; p++) {
+		table = ports_table[p];
+		local_index = find_gid(table, gid, val, false, mask);
+		if (local_index >= 0) {
+			if (index)
+				*index = local_index;
+			if (port)
+				*port = p + rdma_start_port(ib_dev);
+			return 0;
 		}
 	}
-found:
-	read_unlock_irqrestore(&device->cache.lock, flags);
 
-	return ret;
+	return -ENOENT;
+}
+
+static int ib_cache_gid_find(struct ib_device *ib_dev,
+			     const union ib_gid *gid,
+			     struct net_device *ndev, u8 *port,
+			     u16 *index)
+{
+	unsigned long mask = GID_ATTR_FIND_MASK_GID;
+	struct ib_gid_attr gid_attr_val = {.ndev = ndev};
+
+	if (ndev)
+		mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+	return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
+					mask, port, index);
+}
+
+int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
+			      const union ib_gid *gid,
+			      u8 port, struct net_device *ndev,
+			      u16 *index)
+{
+	int local_index;
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	struct ib_gid_table *table;
+	unsigned long mask = GID_ATTR_FIND_MASK_GID;
+	struct ib_gid_attr val = {.ndev = ndev};
+
+	if (port < rdma_start_port(ib_dev) ||
+	    port > rdma_end_port(ib_dev))
+		return -ENOENT;
+
+	table = ports_table[port - rdma_start_port(ib_dev)];
+
+	if (ndev)
+		mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+	local_index = find_gid(table, gid, &val, false, mask);
+	if (local_index >= 0) {
+		if (index)
+			*index = local_index;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static struct ib_gid_table *alloc_gid_table(int sz)
+{
+	unsigned int i;
+	struct ib_gid_table *table =
+		kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
+	if (!table->data_vec)
+		goto err_free_table;
+
+	mutex_init(&table->lock);
+
+	table->sz = sz;
+
+	for (i = 0; i < sz; i++)
+		rwlock_init(&table->data_vec[i].lock);
+
+	return table;
+
+err_free_table:
+	kfree(table);
+	return NULL;
+}
+
+static void release_gid_table(struct ib_gid_table *table)
+{
+	if (table) {
+		kfree(table->data_vec);
+		kfree(table);
+	}
+}
+
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
+				   struct ib_gid_table *table)
+{
+	int i;
+
+	if (!table)
+		return;
+
+	for (i = 0; i < table->sz; ++i) {
+		if (memcmp(&table->data_vec[i].gid, &zgid,
+			   sizeof(table->data_vec[i].gid)))
+			del_gid(ib_dev, port, table, i,
+				table->data_vec[i].props &
+				GID_ATTR_FIND_MASK_DEFAULT);
+	}
+}
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+				  struct net_device *ndev,
+				  enum ib_cache_gid_default_mode mode)
+{
+	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+	union ib_gid gid;
+	struct ib_gid_attr gid_attr;
+	struct ib_gid_table *table;
+	int ix;
+	union ib_gid current_gid;
+	struct ib_gid_attr current_gid_attr = {};
+
+	table  = ports_table[port - rdma_start_port(ib_dev)];
+
+	make_default_gid(ndev, &gid);
+	memset(&gid_attr, 0, sizeof(gid_attr));
+	gid_attr.ndev = ndev;
+
+	mutex_lock(&table->lock);
+	ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT);
+
+	/* Couldn't find the default GID location */
+	WARN_ON(ix < 0);
+
+	if (!__ib_cache_gid_get(ib_dev, port, ix,
+				&current_gid, &current_gid_attr) &&
+	    mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
+	    !memcmp(&gid, &current_gid, sizeof(gid)) &&
+	    !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
+		goto unlock;
+
+	if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
+	     memcmp(&current_gid_attr, &zattr,
+		    sizeof(current_gid_attr))) &&
+	    del_gid(ib_dev, port, table, ix, true)) {
+		pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
+			ix, gid.raw);
+		goto unlock;
+	}
+
+	if (mode == IB_CACHE_GID_DEFAULT_MODE_SET)
+		if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
+			pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
+				gid.raw);
+
+unlock:
+	if (current_gid_attr.ndev)
+		dev_put(current_gid_attr.ndev);
+	mutex_unlock(&table->lock);
+}
+
+static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
+				     struct ib_gid_table *table)
+{
+	if (rdma_protocol_roce(ib_dev, port)) {
+		struct ib_gid_table_entry *entry = &table->data_vec[0];
+
+		entry->props |= GID_TABLE_ENTRY_DEFAULT;
+	}
+
+	return 0;
+}
+
+static int _gid_table_setup_one(struct ib_device *ib_dev)
+{
+	u8 port;
+	struct ib_gid_table **table;
+	int err = 0;
+
+	table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL);
+
+	if (!table) {
+		pr_warn("failed to allocate ib gid cache for %s\n",
+			ib_dev->name);
+		return -ENOMEM;
+	}
+
+	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+		u8 rdma_port = port + rdma_start_port(ib_dev);
+
+		table[port] =
+			alloc_gid_table(
+				ib_dev->port_immutable[rdma_port].gid_tbl_len);
+		if (!table[port]) {
+			err = -ENOMEM;
+			goto rollback_table_setup;
+		}
+
+		err = gid_table_reserve_default(ib_dev,
+						port + rdma_start_port(ib_dev),
+						table[port]);
+		if (err)
+			goto rollback_table_setup;
+	}
+
+	ib_dev->cache.gid_cache = table;
+	return 0;
+
+rollback_table_setup:
+	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+				       table[port]);
+		release_gid_table(table[port]);
+	}
+
+	kfree(table);
+	return err;
+}
+
+static void gid_table_release_one(struct ib_device *ib_dev)
+{
+	struct ib_gid_table **table = ib_dev->cache.gid_cache;
+	u8 port;
+
+	if (!table)
+		return;
+
+	for (port = 0; port < ib_dev->phys_port_cnt; port++)
+		release_gid_table(table[port]);
+
+	kfree(table);
+	ib_dev->cache.gid_cache = NULL;
+}
+
+static void gid_table_cleanup_one(struct ib_device *ib_dev)
+{
+	struct ib_gid_table **table = ib_dev->cache.gid_cache;
+	u8 port;
+
+	if (!table)
+		return;
+
+	for (port = 0; port < ib_dev->phys_port_cnt; port++)
+		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+				       table[port]);
+}
+
+static int gid_table_setup_one(struct ib_device *ib_dev)
+{
+	int err;
+
+	err = _gid_table_setup_one(ib_dev);
+
+	if (err)
+		return err;
+
+	err = roce_rescan_device(ib_dev);
+
+	if (err) {
+		gid_table_cleanup_one(ib_dev);
+		gid_table_release_one(ib_dev);
+	}
+
+	return err;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+		      u8                port_num,
+		      int               index,
+		      union ib_gid     *gid)
+{
+	if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+		return -EINVAL;
+
+	return __ib_cache_gid_get(device, port_num, index, gid, NULL);
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+		       const union ib_gid *gid,
+		       u8               *port_num,
+		       u16              *index)
+{
+	return ib_cache_gid_find(device, gid, NULL, port_num, index);
 }
 EXPORT_SYMBOL(ib_find_cached_gid);
 
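A note on the consumer side: the retained ib_get_cached_gid()/
ib_find_cached_gid() entry points now resolve through the per-entry-locked
GID table above. As a minimal sketch (a hypothetical helper, not part of
the patch), a consumer could scan a port's table like this, skipping slots
that report -EAGAIN because they are being rewritten concurrently:

	static int first_valid_gid(struct ib_device *ib_dev, u8 port,
				   union ib_gid *gid)
	{
		int i;

		/* gid_tbl_len bounds the per-port table; zgid marks an
		 * empty slot.
		 */
		for (i = 0;
		     i < ib_dev->port_immutable[port].gid_tbl_len; i++) {
			if (ib_get_cached_gid(ib_dev, port, i, gid))
				continue;
			if (memcmp(gid, &zgid, sizeof(*gid)))
				return i;
		}
		return -ENOENT;
	}
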
@@ -243,9 +790,21 @@ static void ib_cache_update(struct ib_device *device,
 {
 	struct ib_port_attr       *tprops = NULL;
 	struct ib_pkey_cache      *pkey_cache = NULL, *old_pkey_cache;
-	struct ib_gid_cache       *gid_cache = NULL, *old_gid_cache;
+	struct ib_gid_cache {
+		int             table_len;
+		union ib_gid    table[0];
+	}			  *gid_cache = NULL;
 	int                        i;
 	int                        ret;
+	struct ib_gid_table	  *table;
+	struct ib_gid_table	 **ports_table = device->cache.gid_cache;
+	bool			   use_roce_gid_table =
+					rdma_cap_roce_gid_table(device, port);
+
+	if (port < rdma_start_port(device) || port > rdma_end_port(device))
+		return;
+
+	table = ports_table[port - rdma_start_port(device)];
 
 	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
 	if (!tprops)
@@ -265,12 +824,14 @@ static void ib_cache_update(struct ib_device *device,
 
 	pkey_cache->table_len = tprops->pkey_tbl_len;
 
-	gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
-			    sizeof *gid_cache->table, GFP_KERNEL);
-	if (!gid_cache)
-		goto err;
+	if (!use_roce_gid_table) {
+		gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len *
+			    sizeof(*gid_cache->table), GFP_KERNEL);
+		if (!gid_cache)
+			goto err;
 
-	gid_cache->table_len = tprops->gid_tbl_len;
+		gid_cache->table_len = tprops->gid_tbl_len;
+	}
 
 	for (i = 0; i < pkey_cache->table_len; ++i) {
 		ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
@@ -281,29 +842,36 @@ static void ib_cache_update(struct ib_device *device,
 		}
 	}
 
-	for (i = 0; i < gid_cache->table_len; ++i) {
-		ret = ib_query_gid(device, port, i, gid_cache->table + i);
-		if (ret) {
-			printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
-			       ret, device->name, i);
-			goto err;
+	if (!use_roce_gid_table) {
+		for (i = 0;  i < gid_cache->table_len; ++i) {
+			ret = ib_query_gid(device, port, i,
+					   gid_cache->table + i);
+			if (ret) {
+				printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+				       ret, device->name, i);
+				goto err;
+			}
 		}
 	}
 
 	write_lock_irq(&device->cache.lock);
 
 	old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)];
-	old_gid_cache  = device->cache.gid_cache [port - rdma_start_port(device)];
 
 	device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache;
-	device->cache.gid_cache [port - rdma_start_port(device)] = gid_cache;
+	if (!use_roce_gid_table) {
+		for (i = 0; i < gid_cache->table_len; i++) {
+			modify_gid(device, port, table, i, gid_cache->table + i,
+				   &zattr, false);
+		}
+	}
 
 	device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;
 
 	write_unlock_irq(&device->cache.lock);
 
+	kfree(gid_cache);
 	kfree(old_pkey_cache);
-	kfree(old_gid_cache);
 	kfree(tprops);
 	return;
 
@@ -344,85 +912,88 @@ static void ib_cache_event(struct ib_event_handler *handler,
 	}
 }
 
-static void ib_cache_setup_one(struct ib_device *device)
+int ib_cache_setup_one(struct ib_device *device)
 {
 	int p;
+	int err;
 
 	rwlock_init(&device->cache.lock);
 
 	device->cache.pkey_cache =
-		kmalloc(sizeof *device->cache.pkey_cache *
-			(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
-	device->cache.gid_cache =
-		kmalloc(sizeof *device->cache.gid_cache *
+		kzalloc(sizeof *device->cache.pkey_cache *
 			(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
-
 	device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
 					  (rdma_end_port(device) -
 					   rdma_start_port(device) + 1),
 					  GFP_KERNEL);
-
-	if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+	if (!device->cache.pkey_cache ||
 	    !device->cache.lmc_cache) {
 		printk(KERN_WARNING "Couldn't allocate cache "
 		       "for %s\n", device->name);
-		goto err;
+		return -ENOMEM;
 	}
 
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-		device->cache.pkey_cache[p] = NULL;
-		device->cache.gid_cache [p] = NULL;
+	err = gid_table_setup_one(device);
+	if (err)
+		/* Allocated memory will be cleaned in the release function */
+		return err;
+
+	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
 		ib_cache_update(device, p + rdma_start_port(device));
-	}
 
 	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
 			      device, ib_cache_event);
-	if (ib_register_event_handler(&device->cache.event_handler))
-		goto err_cache;
-
-	return;
+	err = ib_register_event_handler(&device->cache.event_handler);
+	if (err)
+		goto err;
 
-err_cache:
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-		kfree(device->cache.pkey_cache[p]);
-		kfree(device->cache.gid_cache[p]);
-	}
+	return 0;
 
 err:
-	kfree(device->cache.pkey_cache);
-	kfree(device->cache.gid_cache);
-	kfree(device->cache.lmc_cache);
+	gid_table_cleanup_one(device);
+	return err;
 }
 
-static void ib_cache_cleanup_one(struct ib_device *device)
+void ib_cache_release_one(struct ib_device *device)
 {
 	int p;
 
-	ib_unregister_event_handler(&device->cache.event_handler);
-	flush_workqueue(ib_wq);
-
-	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-		kfree(device->cache.pkey_cache[p]);
-		kfree(device->cache.gid_cache[p]);
-	}
-
+	/*
+	 * The release function frees all the cache elements.
+	 * This function should be called as part of freeing
+	 * all the device's resources when the cache can no
+	 * longer be accessed.
+	 */
+	if (device->cache.pkey_cache)
+		for (p = 0;
+		     p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+			kfree(device->cache.pkey_cache[p]);
+
+	gid_table_release_one(device);
 	kfree(device->cache.pkey_cache);
-	kfree(device->cache.gid_cache);
 	kfree(device->cache.lmc_cache);
 }
 
-static struct ib_client cache_client = {
-	.name   = "cache",
-	.add    = ib_cache_setup_one,
-	.remove = ib_cache_cleanup_one
-};
+void ib_cache_cleanup_one(struct ib_device *device)
+{
+	/* The cleanup function unregisters the event handler,
+	 * waits for all in-progress workqueue elements and cleans
+	 * up the GID cache. This function should be called after
+	 * the device was removed from the devices list and all
+	 * clients were removed, so the cache exists but is
+	 * non-functional and shouldn't be updated anymore.
+	 */
+	ib_unregister_event_handler(&device->cache.event_handler);
+	flush_workqueue(ib_wq);
+	gid_table_cleanup_one(device);
+}
 
-int __init ib_cache_setup(void)
+void __init ib_cache_setup(void)
 {
-	return ib_register_client(&cache_client);
+	roce_gid_mgmt_init();
 }
 
 void __exit ib_cache_cleanup(void)
 {
-	ib_unregister_client(&cache_client);
+	roce_gid_mgmt_cleanup();
 }
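
Note: make_default_gid() above composes the default GID from the fe80::/64
link-local prefix plus the interface ID that addrconf_ifid_eui48() derives
from the netdev's MAC address. A rough sketch of that derivation for the
common Ethernet case (an illustration of the modified-EUI-64 scheme, not a
copy of the helper):

	static void mac_to_eui64(const u8 *mac, u8 *eui /* gid->raw[8..15] */)
	{
		memcpy(eui, mac, 3);		/* OUI half of the MAC */
		eui[3] = 0xff;			/* fixed ff:fe filler */
		eui[4] = 0xfe;
		memcpy(eui + 5, mac + 3, 3);	/* device half of the MAC */
		eui[0] ^= 0x02;			/* flip universal/local bit */
	}
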
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 3a972ebf3..4f918b929 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -58,7 +58,7 @@ MODULE_DESCRIPTION("InfiniBand CM");
 MODULE_LICENSE("Dual BSD/GPL");
 
 static void cm_add_one(struct ib_device *device);
-static void cm_remove_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client cm_client = {
 	.name   = "cm",
@@ -213,13 +213,15 @@ struct cm_id_private {
 	spinlock_t lock;	/* Do not acquire inside cm.lock */
 	struct completion comp;
 	atomic_t refcount;
+	/* Number of clients sharing this ib_cm_id. Only valid for listeners.
+	 * Protected by the cm.lock spinlock. */
+	int listen_sharecount;
 
 	struct ib_mad_send_buf *msg;
 	struct cm_timewait_info *timewait_info;
 	/* todo: use alternate port on send failure */
 	struct cm_av av;
 	struct cm_av alt_av;
-	struct ib_cm_compare_data *compare_data;
 
 	void *private_data;
 	__be64 tid;
@@ -440,40 +442,6 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
 	return cm_id_priv;
 }
 
-static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask)
-{
-	int i;
-
-	for (i = 0; i < IB_CM_COMPARE_SIZE; i++)
-		dst[i] = src[i] & mask[i];
-}
-
-static int cm_compare_data(struct ib_cm_compare_data *src_data,
-			   struct ib_cm_compare_data *dst_data)
-{
-	u32 src[IB_CM_COMPARE_SIZE];
-	u32 dst[IB_CM_COMPARE_SIZE];
-
-	if (!src_data || !dst_data)
-		return 0;
-
-	cm_mask_copy(src, src_data->data, dst_data->mask);
-	cm_mask_copy(dst, dst_data->data, src_data->mask);
-	return memcmp(src, dst, sizeof(src));
-}
-
-static int cm_compare_private_data(u32 *private_data,
-				   struct ib_cm_compare_data *dst_data)
-{
-	u32 src[IB_CM_COMPARE_SIZE];
-
-	if (!dst_data)
-		return 0;
-
-	cm_mask_copy(src, private_data, dst_data->mask);
-	return memcmp(src, dst_data->data, sizeof(src));
-}
-
 /*
  * Trivial helpers to strip endian annotation and compare; the
  * endianness doesn't actually matter since we just need a stable
@@ -506,18 +474,14 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 	struct cm_id_private *cur_cm_id_priv;
 	__be64 service_id = cm_id_priv->id.service_id;
 	__be64 service_mask = cm_id_priv->id.service_mask;
-	int data_cmp;
 
 	while (*link) {
 		parent = *link;
 		cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
 					  service_node);
-		data_cmp = cm_compare_data(cm_id_priv->compare_data,
-					   cur_cm_id_priv->compare_data);
 		if ((cur_cm_id_priv->id.service_mask & service_id) ==
 		    (service_mask & cur_cm_id_priv->id.service_id) &&
-		    (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
-		    !data_cmp)
+		    (cm_id_priv->id.device == cur_cm_id_priv->id.device))
 			return cur_cm_id_priv;
 
 		if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
@@ -528,8 +492,6 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 			link = &(*link)->rb_left;
 		else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
 			link = &(*link)->rb_right;
-		else if (data_cmp < 0)
-			link = &(*link)->rb_left;
 		else
 			link = &(*link)->rb_right;
 	}
@@ -539,20 +501,16 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 }
 
 static struct cm_id_private * cm_find_listen(struct ib_device *device,
-					     __be64 service_id,
-					     u32 *private_data)
+					     __be64 service_id)
 {
 	struct rb_node *node = cm.listen_service_table.rb_node;
 	struct cm_id_private *cm_id_priv;
-	int data_cmp;
 
 	while (node) {
 		cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
-		data_cmp = cm_compare_private_data(private_data,
-						   cm_id_priv->compare_data);
 		if ((cm_id_priv->id.service_mask & service_id) ==
 		     cm_id_priv->id.service_id &&
-		    (cm_id_priv->id.device == device) && !data_cmp)
+		    (cm_id_priv->id.device == device))
 			return cm_id_priv;
 
 		if (device < cm_id_priv->id.device)
@@ -563,8 +521,6 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
 			node = node->rb_left;
 		else if (be64_gt(service_id, cm_id_priv->id.service_id))
 			node = node->rb_right;
-		else if (data_cmp < 0)
-			node = node->rb_left;
 		else
 			node = node->rb_right;
 	}
@@ -859,9 +815,15 @@ retest:
 	spin_lock_irq(&cm_id_priv->lock);
 	switch (cm_id->state) {
 	case IB_CM_LISTEN:
-		cm_id->state = IB_CM_IDLE;
 		spin_unlock_irq(&cm_id_priv->lock);
+
 		spin_lock_irq(&cm.lock);
+		if (--cm_id_priv->listen_sharecount > 0) {
+			/* The id is still shared. */
+			cm_deref_id(cm_id_priv);
+			spin_unlock_irq(&cm.lock);
+			return;
+		}
 		rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
 		spin_unlock_irq(&cm.lock);
 		break;
@@ -873,6 +835,11 @@ retest:
 	case IB_CM_SIDR_REQ_RCVD:
 		spin_unlock_irq(&cm_id_priv->lock);
 		cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+		spin_lock_irq(&cm.lock);
+		if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
+			rb_erase(&cm_id_priv->sidr_id_node,
+				 &cm.remote_sidr_table);
+		spin_unlock_irq(&cm.lock);
 		break;
 	case IB_CM_REQ_SENT:
 	case IB_CM_MRA_REQ_RCVD:
@@ -930,7 +897,6 @@ retest:
 	wait_for_completion(&cm_id_priv->comp);
 	while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
 		cm_free_work(work);
-	kfree(cm_id_priv->compare_data);
 	kfree(cm_id_priv->private_data);
 	kfree(cm_id_priv);
 }
@@ -941,11 +907,23 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
 }
 EXPORT_SYMBOL(ib_destroy_cm_id);
 
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
-		 struct ib_cm_compare_data *compare_data)
+/**
+ * __ib_cm_listen - Initiates listening on the specified service ID for
+ *   connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ *   and service ID resolution requests.  The service ID should be specified
+ *   network-byte order.  If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ *   assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ *   range of service IDs.  If set to 0, the service ID is matched
+ *   exactly.  This parameter is ignored if %service_id is set to
+ *   IB_CM_ASSIGN_SERVICE_ID.
+ */
+static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+			  __be64 service_mask)
 {
 	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
-	unsigned long flags;
 	int ret = 0;
 
 	service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
@@ -958,20 +936,9 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
 	if (cm_id->state != IB_CM_IDLE)
 		return -EINVAL;
 
-	if (compare_data) {
-		cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
-						   GFP_KERNEL);
-		if (!cm_id_priv->compare_data)
-			return -ENOMEM;
-		cm_mask_copy(cm_id_priv->compare_data->data,
-			     compare_data->data, compare_data->mask);
-		memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
-		       sizeof(compare_data->mask));
-	}
-
 	cm_id->state = IB_CM_LISTEN;
+	++cm_id_priv->listen_sharecount;
 
-	spin_lock_irqsave(&cm.lock, flags);
 	if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
 		cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
 		cm_id->service_mask = ~cpu_to_be64(0);
@@ -980,18 +947,95 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
 		cm_id->service_mask = service_mask;
 	}
 	cur_cm_id_priv = cm_insert_listen(cm_id_priv);
-	spin_unlock_irqrestore(&cm.lock, flags);
 
 	if (cur_cm_id_priv) {
 		cm_id->state = IB_CM_IDLE;
-		kfree(cm_id_priv->compare_data);
-		cm_id_priv->compare_data = NULL;
+		--cm_id_priv->listen_sharecount;
 		ret = -EBUSY;
 	}
 	return ret;
 }
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	ret = __ib_cm_listen(cm_id, service_id, service_mask);
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	return ret;
+}
 EXPORT_SYMBOL(ib_cm_listen);
 
+/**
+ * ib_cm_insert_listen - Create a new listening ib_cm_id and listen on
+ *   the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id.  All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ *   and service ID resolution requests.  The service ID should be specified
+ *   in network byte order.  If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ *   assign a service ID to the caller.
+ *
+ * Callers should call ib_destroy_cm_id when done with the listener ID.
+ */
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+				     ib_cm_handler cm_handler,
+				     __be64 service_id)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_cm_id *cm_id;
+	unsigned long flags;
+	int err = 0;
+
+	/* Create an ID in advance, since the creation may sleep */
+	cm_id = ib_create_cm_id(device, cm_handler, NULL);
+	if (IS_ERR(cm_id))
+		return cm_id;
+
+	spin_lock_irqsave(&cm.lock, flags);
+
+	if (service_id == IB_CM_ASSIGN_SERVICE_ID)
+		goto new_id;
+
+	/* Find an existing ID */
+	cm_id_priv = cm_find_listen(device, service_id);
+	if (cm_id_priv) {
+		if (cm_id->cm_handler != cm_handler || cm_id->context) {
+			/* Sharing an ib_cm_id with different handlers is not
+			 * supported */
+			spin_unlock_irqrestore(&cm.lock, flags);
+			return ERR_PTR(-EINVAL);
+		}
+		atomic_inc(&cm_id_priv->refcount);
+		++cm_id_priv->listen_sharecount;
+		spin_unlock_irqrestore(&cm.lock, flags);
+
+		ib_destroy_cm_id(cm_id);
+		cm_id = &cm_id_priv->id;
+		return cm_id;
+	}
+
+new_id:
+	/* Use newly created ID */
+	err = __ib_cm_listen(cm_id, service_id, 0);
+
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	if (err) {
+		ib_destroy_cm_id(cm_id);
+		return ERR_PTR(err);
+	}
+	return cm_id;
+}
+EXPORT_SYMBOL(ib_cm_insert_listen);
+
 static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
 			  enum cm_msg_sequence msg_seq)
 {
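
The sharing contract introduced here is reference-count based: a second
ib_cm_insert_listen() call for the same device, service ID and handler
returns the existing ib_cm_id with listen_sharecount bumped, and every
successful caller still balances it with ib_destroy_cm_id(). A hypothetical
usage sketch (names and error policy are illustrative only):

	static void listen_sharing_example(struct ib_device *device,
					   ib_cm_handler handler, __be64 sid)
	{
		struct ib_cm_id *a, *b;

		a = ib_cm_insert_listen(device, handler, sid);
		if (IS_ERR(a))
			return;
		/* Same device/service ID/handler: returns the same ID with
		 * listen_sharecount incremented.
		 */
		b = ib_cm_insert_listen(device, handler, sid);
		if (!IS_ERR(b))
			ib_destroy_cm_id(b);	/* drops the share count */
		ib_destroy_cm_id(a);		/* last ref: unlinks listener */
	}
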
@@ -1268,6 +1312,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
 	primary_path->packet_life_time =
 		cm_req_get_primary_local_ack_timeout(req_msg);
 	primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+	primary_path->service_id = req_msg->service_id;
 
 	if (req_msg->alt_local_lid) {
 		memset(alt_path, 0, sizeof *alt_path);
@@ -1289,9 +1334,28 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
 		alt_path->packet_life_time =
 			cm_req_get_alt_local_ack_timeout(req_msg);
 		alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+		alt_path->service_id = req_msg->service_id;
 	}
 }
 
+static u16 cm_get_bth_pkey(struct cm_work *work)
+{
+	struct ib_device *ib_dev = work->port->cm_dev->ib_device;
+	u8 port_num = work->port->port_num;
+	u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
+	u16 pkey;
+	int ret;
+
+	ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
+	if (ret) {
+		dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n",
+				     port_num, pkey_index, ret);
+		return 0;
+	}
+
+	return pkey;
+}
+
 static void cm_format_req_event(struct cm_work *work,
 				struct cm_id_private *cm_id_priv,
 				struct ib_cm_id *listen_id)
@@ -1302,6 +1366,7 @@ static void cm_format_req_event(struct cm_work *work,
 	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
 	param = &work->cm_event.param.req_rcvd;
 	param->listen_id = listen_id;
+	param->bth_pkey = cm_get_bth_pkey(work);
 	param->port = cm_id_priv->av.port->port_num;
 	param->primary_path = &work->path[0];
 	if (req_msg->alt_local_lid)
@@ -1484,8 +1549,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
 
 	/* Find matching listen request. */
 	listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
-					   req_msg->service_id,
-					   req_msg->private_data);
+					   req_msg->service_id);
 	if (!listen_cm_id_priv) {
 		cm_cleanup_timewait(cm_id_priv->timewait_info);
 		spin_unlock_irq(&cm.lock);
@@ -2992,6 +3056,8 @@ static void cm_format_sidr_req_event(struct cm_work *work,
 	param = &work->cm_event.param.sidr_req_rcvd;
 	param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
 	param->listen_id = listen_id;
+	param->service_id = sidr_req_msg->service_id;
+	param->bth_pkey = cm_get_bth_pkey(work);
 	param->port = work->port->port_num;
 	work->cm_event.private_data = &sidr_req_msg->private_data;
 }
@@ -3031,8 +3097,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
 	}
 	cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
 	cur_cm_id_priv = cm_find_listen(cm_id->device,
-					sidr_req_msg->service_id,
-					sidr_req_msg->private_data);
+					sidr_req_msg->service_id);
 	if (!cur_cm_id_priv) {
 		spin_unlock_irq(&cm.lock);
 		cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
@@ -3112,7 +3177,10 @@ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	spin_lock_irqsave(&cm.lock, flags);
-	rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+	if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) {
+		rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+		RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
+	}
 	spin_unlock_irqrestore(&cm.lock, flags);
 	return 0;
 
@@ -3886,9 +3954,9 @@ free:
 	kfree(cm_dev);
 }
 
-static void cm_remove_one(struct ib_device *ib_device)
+static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 {
-	struct cm_device *cm_dev;
+	struct cm_device *cm_dev = client_data;
 	struct cm_port *port;
 	struct ib_port_modify port_modify = {
 		.clr_port_cap_mask = IB_PORT_CM_SUP
@@ -3896,7 +3964,6 @@ static void cm_remove_one(struct ib_device *ib_device)
 	unsigned long flags;
 	int i;
 
-	cm_dev = ib_get_client_data(ib_device, &cm_client);
 	if (!cm_dev)
 		return;
 
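The hunks above also track a changed ib_client contract: the core now hands
the pointer registered with ib_set_client_data() directly to .remove(), so
clients no longer fetch it with ib_get_client_data() there. A minimal
client sketch under the new signature (all names here are illustrative):

	struct demo_ctx { int dummy; };

	static void demo_add_one(struct ib_device *device);
	static void demo_remove_one(struct ib_device *device, void *data);

	static struct ib_client demo_client = {
		.name	= "demo",
		.add	= demo_add_one,
		.remove	= demo_remove_one,
	};

	static void demo_add_one(struct ib_device *device)
	{
		struct demo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

		ib_set_client_data(device, &demo_client, ctx);
	}

	static void demo_remove_one(struct ib_device *device, void *client_data)
	{
		kfree(client_data);	/* what demo_add_one() registered */
	}
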
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 143ded2bb..36b12d560 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -46,6 +46,8 @@
 
 #include <net/tcp.h>
 #include <net/ipv6.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
 
 #include <rdma/rdma_cm.h>
 #include <rdma/rdma_cm_ib.h>
@@ -94,7 +96,7 @@ const char *rdma_event_msg(enum rdma_cm_event_type event)
 EXPORT_SYMBOL(rdma_event_msg);
 
 static void cma_add_one(struct ib_device *device);
-static void cma_remove_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client cma_client = {
 	.name   = "cma",
@@ -113,6 +115,22 @@ static DEFINE_IDR(udp_ps);
 static DEFINE_IDR(ipoib_ps);
 static DEFINE_IDR(ib_ps);
 
+static struct idr *cma_idr(enum rdma_port_space ps)
+{
+	switch (ps) {
+	case RDMA_PS_TCP:
+		return &tcp_ps;
+	case RDMA_PS_UDP:
+		return &udp_ps;
+	case RDMA_PS_IPOIB:
+		return &ipoib_ps;
+	case RDMA_PS_IB:
+		return &ib_ps;
+	default:
+		return NULL;
+	}
+}
+
 struct cma_device {
 	struct list_head	list;
 	struct ib_device	*device;
@@ -122,11 +140,33 @@ struct cma_device {
 };
 
 struct rdma_bind_list {
-	struct idr		*ps;
+	enum rdma_port_space	ps;
 	struct hlist_head	owners;
 	unsigned short		port;
 };
 
+static int cma_ps_alloc(enum rdma_port_space ps,
+			struct rdma_bind_list *bind_list, int snum)
+{
+	struct idr *idr = cma_idr(ps);
+
+	return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+}
+
+static struct rdma_bind_list *cma_ps_find(enum rdma_port_space ps, int snum)
+{
+	struct idr *idr = cma_idr(ps);
+
+	return idr_find(idr, snum);
+}
+
+static void cma_ps_remove(enum rdma_port_space ps, int snum)
+{
+	struct idr *idr = cma_idr(ps);
+
+	idr_remove(idr, snum);
+}
+
 enum {
 	CMA_OPTION_AFONLY,
 };
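
With the bind list keyed by enum rdma_port_space rather than a raw
struct idr pointer, a bind path passes the port space through the
cma_ps_*() helpers above. A sketch of the allocation side (illustrative,
not the patch's own cma_alloc_port()):

	static int demo_bind_port(enum rdma_port_space ps, unsigned short snum,
				  struct rdma_bind_list **out)
	{
		struct rdma_bind_list *bind_list;
		int ret;

		bind_list = kzalloc(sizeof(*bind_list), GFP_KERNEL);
		if (!bind_list)
			return -ENOMEM;

		ret = cma_ps_alloc(ps, bind_list, snum);
		if (ret < 0) {
			kfree(bind_list);
			return ret;	/* port taken or allocation failure */
		}

		bind_list->ps = ps;
		bind_list->port = snum;
		*out = bind_list;
		return 0;
	}
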
@@ -225,6 +265,15 @@ struct cma_hdr {
 
 #define CMA_VERSION 0x00
 
+struct cma_req_info {
+	struct ib_device *device;
+	int port;
+	union ib_gid local_gid;
+	__be64 service_id;
+	u16 pkey;
+	bool has_gid:1;
+};
+
 static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
 {
 	unsigned long flags;
@@ -262,7 +311,7 @@ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
 	return old;
 }
 
-static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
 {
 	return hdr->ip_version >> 4;
 }
@@ -870,107 +919,419 @@ static inline int cma_any_port(struct sockaddr *addr)
 	return !cma_port(addr);
 }
 
-static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+static void cma_save_ib_info(struct sockaddr *src_addr,
+			     struct sockaddr *dst_addr,
+			     struct rdma_cm_id *listen_id,
 			     struct ib_sa_path_rec *path)
 {
 	struct sockaddr_ib *listen_ib, *ib;
 
 	listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
-	ib = (struct sockaddr_ib *) &id->route.addr.src_addr;
-	ib->sib_family = listen_ib->sib_family;
-	if (path) {
-		ib->sib_pkey = path->pkey;
-		ib->sib_flowinfo = path->flow_label;
-		memcpy(&ib->sib_addr, &path->sgid, 16);
-	} else {
-		ib->sib_pkey = listen_ib->sib_pkey;
-		ib->sib_flowinfo = listen_ib->sib_flowinfo;
-		ib->sib_addr = listen_ib->sib_addr;
-	}
-	ib->sib_sid = listen_ib->sib_sid;
-	ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
-	ib->sib_scope_id = listen_ib->sib_scope_id;
-
-	if (path) {
-		ib = (struct sockaddr_ib *) &id->route.addr.dst_addr;
-		ib->sib_family = listen_ib->sib_family;
-		ib->sib_pkey = path->pkey;
-		ib->sib_flowinfo = path->flow_label;
-		memcpy(&ib->sib_addr, &path->dgid, 16);
+	if (src_addr) {
+		ib = (struct sockaddr_ib *)src_addr;
+		ib->sib_family = AF_IB;
+		if (path) {
+			ib->sib_pkey = path->pkey;
+			ib->sib_flowinfo = path->flow_label;
+			memcpy(&ib->sib_addr, &path->sgid, 16);
+			ib->sib_sid = path->service_id;
+			ib->sib_scope_id = 0;
+		} else {
+			ib->sib_pkey = listen_ib->sib_pkey;
+			ib->sib_flowinfo = listen_ib->sib_flowinfo;
+			ib->sib_addr = listen_ib->sib_addr;
+			ib->sib_sid = listen_ib->sib_sid;
+			ib->sib_scope_id = listen_ib->sib_scope_id;
+		}
+		ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+	}
+	if (dst_addr) {
+		ib = (struct sockaddr_ib *)dst_addr;
+		ib->sib_family = AF_IB;
+		if (path) {
+			ib->sib_pkey = path->pkey;
+			ib->sib_flowinfo = path->flow_label;
+			memcpy(&ib->sib_addr, &path->dgid, 16);
+		}
 	}
 }
 
-static __be16 ss_get_port(const struct sockaddr_storage *ss)
-{
-	if (ss->ss_family == AF_INET)
-		return ((struct sockaddr_in *)ss)->sin_port;
-	else if (ss->ss_family == AF_INET6)
-		return ((struct sockaddr_in6 *)ss)->sin6_port;
-	BUG();
-}
-
-static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
-			      struct cma_hdr *hdr)
+static void cma_save_ip4_info(struct sockaddr *src_addr,
+			      struct sockaddr *dst_addr,
+			      struct cma_hdr *hdr,
+			      __be16 local_port)
 {
 	struct sockaddr_in *ip4;
 
-	ip4 = (struct sockaddr_in *) &id->route.addr.src_addr;
-	ip4->sin_family = AF_INET;
-	ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
-	ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr);
+	if (src_addr) {
+		ip4 = (struct sockaddr_in *)src_addr;
+		ip4->sin_family = AF_INET;
+		ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
+		ip4->sin_port = local_port;
+	}
 
-	ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr;
-	ip4->sin_family = AF_INET;
-	ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
-	ip4->sin_port = hdr->port;
+	if (dst_addr) {
+		ip4 = (struct sockaddr_in *)dst_addr;
+		ip4->sin_family = AF_INET;
+		ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
+		ip4->sin_port = hdr->port;
+	}
 }
 
-static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
-			      struct cma_hdr *hdr)
+static void cma_save_ip6_info(struct sockaddr *src_addr,
+			      struct sockaddr *dst_addr,
+			      struct cma_hdr *hdr,
+			      __be16 local_port)
 {
 	struct sockaddr_in6 *ip6;
 
-	ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr;
-	ip6->sin6_family = AF_INET6;
-	ip6->sin6_addr = hdr->dst_addr.ip6;
-	ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr);
+	if (src_addr) {
+		ip6 = (struct sockaddr_in6 *)src_addr;
+		ip6->sin6_family = AF_INET6;
+		ip6->sin6_addr = hdr->dst_addr.ip6;
+		ip6->sin6_port = local_port;
+	}
 
-	ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr;
-	ip6->sin6_family = AF_INET6;
-	ip6->sin6_addr = hdr->src_addr.ip6;
-	ip6->sin6_port = hdr->port;
+	if (dst_addr) {
+		ip6 = (struct sockaddr_in6 *)dst_addr;
+		ip6->sin6_family = AF_INET6;
+		ip6->sin6_addr = hdr->src_addr.ip6;
+		ip6->sin6_port = hdr->port;
+	}
 }
 
-static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
-			     struct ib_cm_event *ib_event)
+static u16 cma_port_from_service_id(__be64 service_id)
 {
-	struct cma_hdr *hdr;
+	return (u16)be64_to_cpu(service_id);
+}
 
-	if (listen_id->route.addr.src_addr.ss_family == AF_IB) {
-		if (ib_event->event == IB_CM_REQ_RECEIVED)
-			cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path);
-		else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
-			cma_save_ib_info(id, listen_id, NULL);
-		return 0;
-	}
+static int cma_save_ip_info(struct sockaddr *src_addr,
+			    struct sockaddr *dst_addr,
+			    struct ib_cm_event *ib_event,
+			    __be64 service_id)
+{
+	struct cma_hdr *hdr;
+	__be16 port;
 
 	hdr = ib_event->private_data;
 	if (hdr->cma_version != CMA_VERSION)
 		return -EINVAL;
 
+	port = htons(cma_port_from_service_id(service_id));
+
 	switch (cma_get_ip_ver(hdr)) {
 	case 4:
-		cma_save_ip4_info(id, listen_id, hdr);
+		cma_save_ip4_info(src_addr, dst_addr, hdr, port);
 		break;
 	case 6:
-		cma_save_ip6_info(id, listen_id, hdr);
+		cma_save_ip6_info(src_addr, dst_addr, hdr, port);
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	return 0;
+}
+
+static int cma_save_net_info(struct sockaddr *src_addr,
+			     struct sockaddr *dst_addr,
+			     struct rdma_cm_id *listen_id,
+			     struct ib_cm_event *ib_event,
+			     sa_family_t sa_family, __be64 service_id)
+{
+	if (sa_family == AF_IB) {
+		if (ib_event->event == IB_CM_REQ_RECEIVED)
+			cma_save_ib_info(src_addr, dst_addr, listen_id,
+					 ib_event->param.req_rcvd.primary_path);
+		else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+			cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
+		return 0;
+	}
+
+	return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
+}
+
+static int cma_save_req_info(const struct ib_cm_event *ib_event,
+			     struct cma_req_info *req)
+{
+	const struct ib_cm_req_event_param *req_param =
+		&ib_event->param.req_rcvd;
+	const struct ib_cm_sidr_req_event_param *sidr_param =
+		&ib_event->param.sidr_req_rcvd;
+
+	switch (ib_event->event) {
+	case IB_CM_REQ_RECEIVED:
+		req->device	= req_param->listen_id->device;
+		req->port	= req_param->port;
+		memcpy(&req->local_gid, &req_param->primary_path->sgid,
+		       sizeof(req->local_gid));
+		req->has_gid	= true;
+		req->service_id	= req_param->primary_path->service_id;
+		req->pkey	= be16_to_cpu(req_param->primary_path->pkey);
+		break;
+	case IB_CM_SIDR_REQ_RECEIVED:
+		req->device	= sidr_param->listen_id->device;
+		req->port	= sidr_param->port;
+		req->has_gid	= false;
+		req->service_id	= sidr_param->service_id;
+		req->pkey	= sidr_param->pkey;
 		break;
 	default:
 		return -EINVAL;
 	}
+
 	return 0;
 }
 
+static bool validate_ipv4_net_dev(struct net_device *net_dev,
+				  const struct sockaddr_in *dst_addr,
+				  const struct sockaddr_in *src_addr)
+{
+	__be32 daddr = dst_addr->sin_addr.s_addr,
+	       saddr = src_addr->sin_addr.s_addr;
+	struct fib_result res;
+	struct flowi4 fl4;
+	int err;
+	bool ret;
+
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+	    ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) ||
+	    ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) ||
+	    ipv4_is_loopback(saddr))
+		return false;
+
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.flowi4_iif = net_dev->ifindex;
+	fl4.daddr = daddr;
+	fl4.saddr = saddr;
+
+	rcu_read_lock();
+	err = fib_lookup(dev_net(net_dev), &fl4, &res, 0);
+	if (err)
+		return false;
+
+	ret = FIB_RES_DEV(res) == net_dev;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static bool validate_ipv6_net_dev(struct net_device *net_dev,
+				  const struct sockaddr_in6 *dst_addr,
+				  const struct sockaddr_in6 *src_addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	const int strict = ipv6_addr_type(&dst_addr->sin6_addr) &
+			   IPV6_ADDR_LINKLOCAL;
+	struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr,
+					 &src_addr->sin6_addr, net_dev->ifindex,
+					 strict);
+	bool ret;
+
+	if (!rt)
+		return false;
+
+	ret = rt->rt6i_idev->dev == net_dev;
+	ip6_rt_put(rt);
+
+	return ret;
+#else
+	return false;
+#endif
+}
+
+static bool validate_net_dev(struct net_device *net_dev,
+			     const struct sockaddr *daddr,
+			     const struct sockaddr *saddr)
+{
+	const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr;
+	const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr;
+	const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+	const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr;
+
+	switch (daddr->sa_family) {
+	case AF_INET:
+		return saddr->sa_family == AF_INET &&
+		       validate_ipv4_net_dev(net_dev, daddr4, saddr4);
+
+	case AF_INET6:
+		return saddr->sa_family == AF_INET6 &&
+		       validate_ipv6_net_dev(net_dev, daddr6, saddr6);
+
+	default:
+		return false;
+	}
+}
+
+static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event,
+					  const struct cma_req_info *req)
+{
+	struct sockaddr_storage listen_addr_storage, src_addr_storage;
+	struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage,
+			*src_addr = (struct sockaddr *)&src_addr_storage;
+	struct net_device *net_dev;
+	const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
+	int err;
+
+	err = cma_save_ip_info(listen_addr, src_addr, ib_event,
+			       req->service_id);
+	if (err)
+		return ERR_PTR(err);
+
+	net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey,
+					   gid, listen_addr);
+	if (!net_dev)
+		return ERR_PTR(-ENODEV);
+
+	if (!validate_net_dev(net_dev, listen_addr, src_addr)) {
+		dev_put(net_dev);
+		return ERR_PTR(-EHOSTUNREACH);
+	}
+
+	return net_dev;
+}
+
+static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id)
+{
+	return (be64_to_cpu(service_id) >> 16) & 0xffff;
+}
+
+static bool cma_match_private_data(struct rdma_id_private *id_priv,
+				   const struct cma_hdr *hdr)
+{
+	struct sockaddr *addr = cma_src_addr(id_priv);
+	__be32 ip4_addr;
+	struct in6_addr ip6_addr;
+
+	if (cma_any_addr(addr) && !id_priv->afonly)
+		return true;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
+		if (cma_get_ip_ver(hdr) != 4)
+			return false;
+		if (!cma_any_addr(addr) &&
+		    hdr->dst_addr.ip4.addr != ip4_addr)
+			return false;
+		break;
+	case AF_INET6:
+		ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
+		if (cma_get_ip_ver(hdr) != 6)
+			return false;
+		if (!cma_any_addr(addr) &&
+		    memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
+			return false;
+		break;
+	case AF_IB:
+		return true;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static bool cma_protocol_roce_dev_port(struct ib_device *device, int port_num)
+{
+	enum rdma_link_layer ll = rdma_port_get_link_layer(device, port_num);
+	enum rdma_transport_type transport =
+		rdma_node_get_transport(device->node_type);
+
+	return ll == IB_LINK_LAYER_ETHERNET && transport == RDMA_TRANSPORT_IB;
+}
+
+static bool cma_protocol_roce(const struct rdma_cm_id *id)
+{
+	struct ib_device *device = id->device;
+	const int port_num = id->port_num ?: rdma_start_port(device);
+
+	return cma_protocol_roce_dev_port(device, port_num);
+}
+
+static bool cma_match_net_dev(const struct rdma_id_private *id_priv,
+			      const struct net_device *net_dev)
+{
+	const struct rdma_addr *addr = &id_priv->id.route.addr;
+
+	if (!net_dev)
+		/* This request is an AF_IB request or a RoCE request */
+		return addr->src_addr.ss_family == AF_IB ||
+		       cma_protocol_roce(&id_priv->id);
+
+	return !addr->dev_addr.bound_dev_if ||
+	       (net_eq(dev_net(net_dev), &init_net) &&
+		addr->dev_addr.bound_dev_if == net_dev->ifindex);
+}
+
+static struct rdma_id_private *cma_find_listener(
+		const struct rdma_bind_list *bind_list,
+		const struct ib_cm_id *cm_id,
+		const struct ib_cm_event *ib_event,
+		const struct cma_req_info *req,
+		const struct net_device *net_dev)
+{
+	struct rdma_id_private *id_priv, *id_priv_dev;
+
+	if (!bind_list)
+		return ERR_PTR(-EINVAL);
+
+	hlist_for_each_entry(id_priv, &bind_list->owners, node) {
+		if (cma_match_private_data(id_priv, ib_event->private_data)) {
+			if (id_priv->id.device == cm_id->device &&
+			    cma_match_net_dev(id_priv, net_dev))
+				return id_priv;
+			list_for_each_entry(id_priv_dev,
+					    &id_priv->listen_list,
+					    listen_list) {
+				if (id_priv_dev->id.device == cm_id->device &&
+				    cma_match_net_dev(id_priv_dev, net_dev))
+					return id_priv_dev;
+			}
+		}
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
+						 struct ib_cm_event *ib_event,
+						 struct net_device **net_dev)
+{
+	struct cma_req_info req;
+	struct rdma_bind_list *bind_list;
+	struct rdma_id_private *id_priv;
+	int err;
+
+	err = cma_save_req_info(ib_event, &req);
+	if (err)
+		return ERR_PTR(err);
+
+	*net_dev = cma_get_net_dev(ib_event, &req);
+	if (IS_ERR(*net_dev)) {
+		if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
+			/* Assuming the protocol is AF_IB */
+			*net_dev = NULL;
+		} else if (cma_protocol_roce_dev_port(req.device, req.port)) {
+			/* TODO find the net dev matching the request parameters
+			 * through the RoCE GID table */
+			*net_dev = NULL;
+		} else {
+			return ERR_CAST(*net_dev);
+		}
+	}
+
+	bind_list = cma_ps_find(rdma_ps_from_service_id(req.service_id),
+				cma_port_from_service_id(req.service_id));
+	id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
+	if (IS_ERR(id_priv) && *net_dev) {
+		dev_put(*net_dev);
+		*net_dev = NULL;
+	}
+
+	return id_priv;
+}
+
 static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
 {
 	return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
@@ -1038,7 +1399,7 @@ static void cma_release_port(struct rdma_id_private *id_priv)
 	mutex_lock(&lock);
 	hlist_del(&id_priv->node);
 	if (hlist_empty(&bind_list->owners)) {
-		idr_remove(bind_list->ps, bind_list->port);
+		cma_ps_remove(bind_list->ps, bind_list->port);
 		kfree(bind_list);
 	}
 	mutex_unlock(&lock);
@@ -1216,11 +1577,15 @@ out:
 }
 
 static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
-					       struct ib_cm_event *ib_event)
+					       struct ib_cm_event *ib_event,
+					       struct net_device *net_dev)
 {
 	struct rdma_id_private *id_priv;
 	struct rdma_cm_id *id;
 	struct rdma_route *rt;
+	const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+	const __be64 service_id =
+		      ib_event->param.req_rcvd.primary_path->service_id;
 	int ret;
 
 	id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1229,7 +1594,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 		return NULL;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (cma_save_net_info(id, listen_id, ib_event))
+	if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+			      (struct sockaddr *)&id->route.addr.dst_addr,
+			      listen_id, ib_event, ss_family, service_id))
 		goto err;
 
 	rt = &id->route;
@@ -1243,14 +1610,21 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
 	if (rt->num_paths == 2)
 		rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
 
-	if (cma_any_addr(cma_src_addr(id_priv))) {
-		rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
-		rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
-		ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
-	} else {
-		ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+	if (net_dev) {
+		ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
 		if (ret)
 			goto err;
+	} else {
+		if (!cma_protocol_roce(listen_id) &&
+		    cma_any_addr(cma_src_addr(id_priv))) {
+			rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+			rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+			ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
+		} else if (!cma_any_addr(cma_src_addr(id_priv))) {
+			ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+			if (ret)
+				goto err;
+		}
 	}
 	rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
 
@@ -1263,10 +1637,12 @@ err:
 }
 
 static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
-					      struct ib_cm_event *ib_event)
+					      struct ib_cm_event *ib_event,
+					      struct net_device *net_dev)
 {
 	struct rdma_id_private *id_priv;
 	struct rdma_cm_id *id;
+	const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
 	int ret;
 
 	id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1275,13 +1651,23 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
 		return NULL;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
-	if (cma_save_net_info(id, listen_id, ib_event))
+	if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+			      (struct sockaddr *)&id->route.addr.dst_addr,
+			      listen_id, ib_event, ss_family,
+			      ib_event->param.sidr_req_rcvd.service_id))
 		goto err;
 
-	if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
-		ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr);
+	if (net_dev) {
+		ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
 		if (ret)
 			goto err;
+	} else {
+		if (!cma_any_addr(cma_src_addr(id_priv))) {
+			ret = cma_translate_addr(cma_src_addr(id_priv),
+						 &id->route.addr.dev_addr);
+			if (ret)
+				goto err;
+		}
 	}
 
 	id_priv->state = RDMA_CM_CONNECT;
@@ -1319,25 +1705,33 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 {
 	struct rdma_id_private *listen_id, *conn_id;
 	struct rdma_cm_event event;
+	struct net_device *net_dev;
 	int offset, ret;
 
-	listen_id = cm_id->context;
-	if (!cma_check_req_qp_type(&listen_id->id, ib_event))
-		return -EINVAL;
+	listen_id = cma_id_from_event(cm_id, ib_event, &net_dev);
+	if (IS_ERR(listen_id))
+		return PTR_ERR(listen_id);
 
-	if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
-		return -ECONNABORTED;
+	if (!cma_check_req_qp_type(&listen_id->id, ib_event)) {
+		ret = -EINVAL;
+		goto net_dev_put;
+	}
+
+	if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) {
+		ret = -ECONNABORTED;
+		goto net_dev_put;
+	}
 
 	memset(&event, 0, sizeof event);
 	offset = cma_user_data_offset(listen_id);
 	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
 	if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
-		conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+		conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev);
 		event.param.ud.private_data = ib_event->private_data + offset;
 		event.param.ud.private_data_len =
 				IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
 	} else {
-		conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+		conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev);
 		cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
 				       ib_event->private_data, offset);
 	}
@@ -1375,6 +1769,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	mutex_unlock(&conn_id->handler_mutex);
 	mutex_unlock(&listen_id->handler_mutex);
 	cma_deref_id(conn_id);
+	if (net_dev)
+		dev_put(net_dev);
 	return 0;
 
 err3:
@@ -1388,6 +1784,11 @@ err1:
 	mutex_unlock(&listen_id->handler_mutex);
 	if (conn_id)
 		rdma_destroy_id(&conn_id->id);
+
+net_dev_put:
+	if (net_dev)
+		dev_put(net_dev);
+
 	return ret;
 }
 
@@ -1400,42 +1801,6 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
 }
 EXPORT_SYMBOL(rdma_get_service_id);
 
-static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
-				 struct ib_cm_compare_data *compare)
-{
-	struct cma_hdr *cma_data, *cma_mask;
-	__be32 ip4_addr;
-	struct in6_addr ip6_addr;
-
-	memset(compare, 0, sizeof *compare);
-	cma_data = (void *) compare->data;
-	cma_mask = (void *) compare->mask;
-
-	switch (addr->sa_family) {
-	case AF_INET:
-		ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
-		cma_set_ip_ver(cma_data, 4);
-		cma_set_ip_ver(cma_mask, 0xF);
-		if (!cma_any_addr(addr)) {
-			cma_data->dst_addr.ip4.addr = ip4_addr;
-			cma_mask->dst_addr.ip4.addr = htonl(~0);
-		}
-		break;
-	case AF_INET6:
-		ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
-		cma_set_ip_ver(cma_data, 6);
-		cma_set_ip_ver(cma_mask, 0xF);
-		if (!cma_any_addr(addr)) {
-			cma_data->dst_addr.ip6 = ip6_addr;
-			memset(&cma_mask->dst_addr.ip6, 0xFF,
-			       sizeof cma_mask->dst_addr.ip6);
-		}
-		break;
-	default:
-		break;
-	}
-}
-
 static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
 {
 	struct rdma_id_private *id_priv = iw_id->context;
@@ -1589,33 +1954,18 @@ out:
 
 static int cma_ib_listen(struct rdma_id_private *id_priv)
 {
-	struct ib_cm_compare_data compare_data;
 	struct sockaddr *addr;
 	struct ib_cm_id	*id;
 	__be64 svc_id;
-	int ret;
 
-	id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+	addr = cma_src_addr(id_priv);
+	svc_id = rdma_get_service_id(&id_priv->id, addr);
+	id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id);
 	if (IS_ERR(id))
 		return PTR_ERR(id);
-
 	id_priv->cm_id.ib = id;
 
-	addr = cma_src_addr(id_priv);
-	svc_id = rdma_get_service_id(&id_priv->id, addr);
-	if (cma_any_addr(addr) && !id_priv->afonly)
-		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
-	else {
-		cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
-		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
-	}
-
-	if (ret) {
-		ib_destroy_cm_id(id_priv->cm_id.ib);
-		id_priv->cm_id.ib = NULL;
-	}
-
-	return ret;
+	return 0;
 }
 
 static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
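
The switch to ib_cm_insert_listen() removes the per-ID ib_cm_listen()/compare_data pair: one shared CM listen per service ID is created, and destination-based demultiplexing now happens in the RDMA CM core (see cma_find_listener() above). User-visible behaviour is meant to be unchanged. A minimal userspace sketch (librdmacm; addresses and port are arbitrary, most error handling elided) of the case the old compare_data supported and the new demux must keep working:

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <rdma/rdma_cma.h>

static struct rdma_cm_id *listen_on(struct rdma_event_channel *ch,
				    const char *ip, uint16_t port)
{
	struct sockaddr_in sin;
	struct rdma_cm_id *id;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	inet_pton(AF_INET, ip, &sin.sin_addr);

	if (rdma_create_id(ch, &id, NULL, RDMA_PS_TCP))
		return NULL;
	if (rdma_bind_addr(id, (struct sockaddr *)&sin) ||
	    rdma_listen(id, 8)) {
		rdma_destroy_id(id);
		return NULL;
	}
	return id;
}

int main(void)
{
	struct rdma_event_channel *ch = rdma_create_event_channel();

	if (!ch)
		return 1;
	/* Two listeners on the same port; requests are demuxed by the
	 * destination IP carried in the connect request's private data. */
	listen_on(ch, "192.0.2.1", 7471);
	listen_on(ch, "192.0.2.2", 7471);
	return 0;
}
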
@@ -2203,8 +2553,11 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
 		src_addr = (struct sockaddr *) &id->route.addr.src_addr;
 		src_addr->sa_family = dst_addr->sa_family;
 		if (dst_addr->sa_family == AF_INET6) {
-			((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
-				((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+			struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
+			struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
+			src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+			if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+				id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
 		} else if (dst_addr->sa_family == AF_IB) {
 			((struct sockaddr_ib *) src_addr)->sib_pkey =
 				((struct sockaddr_ib *) dst_addr)->sib_pkey;
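
The new link-local branch matters because an fe80::/10 destination is ambiguous without an interface: sin6_scope_id names that interface, and the hunk above now records it in bound_dev_if so the CMA resolves through the right device. A small userspace sketch of the caller side (librdmacm; the address and interface name below are placeholders):

#include <string.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <rdma/rdma_cma.h>

int resolve_linklocal(struct rdma_cm_id *id)
{
	struct sockaddr_in6 dst;

	memset(&dst, 0, sizeof(dst));
	dst.sin6_family = AF_INET6;
	inet_pton(AF_INET6, "fe80::1", &dst.sin6_addr);	/* placeholder */
	dst.sin6_scope_id = if_nametoindex("eth2");	/* placeholder ifname */

	/* NULL src: the kernel picks the source via the scope interface */
	return rdma_resolve_addr(id, NULL, (struct sockaddr *)&dst, 2000);
}
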
@@ -2325,8 +2678,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list,
 	hlist_add_head(&id_priv->node, &bind_list->owners);
 }
 
-static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
-			  unsigned short snum)
+static int cma_alloc_port(enum rdma_port_space ps,
+			  struct rdma_id_private *id_priv, unsigned short snum)
 {
 	struct rdma_bind_list *bind_list;
 	int ret;
@@ -2335,7 +2688,7 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
 	if (!bind_list)
 		return -ENOMEM;
 
-	ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL);
+	ret = cma_ps_alloc(ps, bind_list, snum);
 	if (ret < 0)
 		goto err;
 
@@ -2348,7 +2701,8 @@ err:
 	return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
 }
 
-static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_alloc_any_port(enum rdma_port_space ps,
+			      struct rdma_id_private *id_priv)
 {
 	static unsigned int last_used_port;
 	int low, high, remaining;
@@ -2359,7 +2713,7 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
 	rover = prandom_u32() % remaining + low;
 retry:
 	if (last_used_port != rover &&
-	    !idr_find(ps, (unsigned short) rover)) {
+	    !cma_ps_find(ps, (unsigned short)rover)) {
 		int ret = cma_alloc_port(ps, id_priv, rover);
 		/*
 		 * Remember previously used port number in order to avoid
@@ -2414,7 +2768,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
 	return 0;
 }
 
-static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_use_port(enum rdma_port_space ps,
+			struct rdma_id_private *id_priv)
 {
 	struct rdma_bind_list *bind_list;
 	unsigned short snum;
@@ -2424,7 +2779,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
 	if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
 		return -EACCES;
 
-	bind_list = idr_find(ps, snum);
+	bind_list = cma_ps_find(ps, snum);
 	if (!bind_list) {
 		ret = cma_alloc_port(ps, id_priv, snum);
 	} else {
@@ -2447,25 +2802,24 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
 	return ret;
 }
 
-static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_inet_ps(
+		struct rdma_id_private *id_priv)
 {
 	switch (id_priv->id.ps) {
 	case RDMA_PS_TCP:
-		return &tcp_ps;
 	case RDMA_PS_UDP:
-		return &udp_ps;
 	case RDMA_PS_IPOIB:
-		return &ipoib_ps;
 	case RDMA_PS_IB:
-		return &ib_ps;
+		return id_priv->id.ps;
 	default:
-		return NULL;
+		return 0;
 	}
 }
 
-static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv)
 {
-	struct idr *ps = NULL;
+	enum rdma_port_space ps = 0;
 	struct sockaddr_ib *sib;
 	u64 sid_ps, mask, sid;
 
@@ -2475,15 +2829,15 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
 
 	if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
 		sid_ps = RDMA_IB_IP_PS_IB;
-		ps = &ib_ps;
+		ps = RDMA_PS_IB;
 	} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
 		   (sid == (RDMA_IB_IP_PS_TCP & mask))) {
 		sid_ps = RDMA_IB_IP_PS_TCP;
-		ps = &tcp_ps;
+		ps = RDMA_PS_TCP;
 	} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
 		   (sid == (RDMA_IB_IP_PS_UDP & mask))) {
 		sid_ps = RDMA_IB_IP_PS_UDP;
-		ps = &udp_ps;
+		ps = RDMA_PS_UDP;
 	}
 
 	if (ps) {
@@ -2496,7 +2850,7 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
 
 static int cma_get_port(struct rdma_id_private *id_priv)
 {
-	struct idr *ps;
+	enum rdma_port_space ps;
 	int ret;
 
 	if (cma_family(id_priv) != AF_IB)
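
cma_get_port() now dispatches on an enum rdma_port_space instead of a bare struct idr pointer; the cma_ps_alloc()/cma_ps_find()/cma_ps_remove() accessors it relies on are added by this patch in a hunk not shown here. A plausible minimal sketch of that mapping layer, assuming one global idr per port space exactly as in the code being replaced (the patch's real helpers may differ in detail):

static struct idr *cma_ps_idr(enum rdma_port_space ps)
{
	switch (ps) {
	case RDMA_PS_TCP:	return &tcp_ps;
	case RDMA_PS_UDP:	return &udp_ps;
	case RDMA_PS_IPOIB:	return &ipoib_ps;
	case RDMA_PS_IB:	return &ib_ps;
	default:		return NULL;
	}
}

static int cma_ps_alloc(enum rdma_port_space ps,
			struct rdma_bind_list *bind_list, int snum)
{
	return idr_alloc(cma_ps_idr(ps), bind_list, snum, snum + 1,
			 GFP_KERNEL);
}

static struct rdma_bind_list *cma_ps_find(enum rdma_port_space ps, int snum)
{
	return idr_find(cma_ps_idr(ps), snum);
}

static void cma_ps_remove(enum rdma_port_space ps, int snum)
{
	idr_remove(cma_ps_idr(ps), snum);
}
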
@@ -3551,11 +3905,10 @@ static void cma_process_remove(struct cma_device *cma_dev)
 	wait_for_completion(&cma_dev->comp);
 }
 
-static void cma_remove_one(struct ib_device *device)
+static void cma_remove_one(struct ib_device *device, void *client_data)
 {
-	struct cma_device *cma_dev;
+	struct cma_device *cma_dev = client_data;
 
-	cma_dev = ib_get_client_data(device, &cma_client);
 	if (!cma_dev)
 		return;
 
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 87d1936f5..70bb36ebb 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -43,12 +43,58 @@ int  ib_device_register_sysfs(struct ib_device *device,
 						   u8, struct kobject *));
 void ib_device_unregister_sysfs(struct ib_device *device);
 
-int  ib_sysfs_setup(void);
-void ib_sysfs_cleanup(void);
-
-int  ib_cache_setup(void);
+void ib_cache_setup(void);
 void ib_cache_cleanup(void);
 
 int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
 			    struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+
+typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
+	      struct net_device *idev, void *cookie);
+
+typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port,
+	     struct net_device *idev, void *cookie);
+
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+			 roce_netdev_filter filter,
+			 void *filter_cookie,
+			 roce_netdev_callback cb,
+			 void *cookie);
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+			      void *filter_cookie,
+			      roce_netdev_callback cb,
+			      void *cookie);
+
+int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
+			      const union ib_gid *gid,
+			      u8 port, struct net_device *ndev,
+			      u16 *index);
+
+enum ib_cache_gid_default_mode {
+	IB_CACHE_GID_DEFAULT_MODE_SET,
+	IB_CACHE_GID_DEFAULT_MODE_DELETE
+};
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+				  struct net_device *ndev,
+				  enum ib_cache_gid_default_mode mode);
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+		     union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+		     union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+				     struct net_device *ndev);
+
+int roce_gid_mgmt_init(void);
+void roce_gid_mgmt_cleanup(void);
+
+int roce_rescan_device(struct ib_device *ib_dev);
+
+int ib_cache_setup_one(struct ib_device *device);
+void ib_cache_cleanup_one(struct ib_device *device);
+void ib_cache_release_one(struct ib_device *device);
+
 #endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 9567756ca..17639117a 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -38,7 +38,10 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
+#include <linux/netdevice.h>
 #include <rdma/rdma_netlink.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 
 #include "core_priv.h"
 
@@ -50,22 +53,34 @@ struct ib_client_data {
 	struct list_head  list;
 	struct ib_client *client;
 	void *            data;
+	/* The device or client is going down. Do not call client or device
+	 * callbacks other than remove(). */
+	bool		  going_down;
 };
 
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
+/* device_list and client_list hold devices and clients whose registration
+ * has completed; entries are removed from them during unregistration. */
 static LIST_HEAD(device_list);
 static LIST_HEAD(client_list);
 
 /*
- * device_mutex protects access to both device_list and client_list.
- * There's no real point to using multiple locks or something fancier
- * like an rwsem: we always access both lists, and we're always
- * modifying one list or the other list.  In any case this is not a
- * hot path so there's no point in trying to optimize.
+ * device_mutex and lists_rwsem protect access to both device_list and
+ * client_list.  device_mutex protects writer access by device and client
+ * registration / de-registration.  lists_rwsem protects reader access to
+ * these lists.  Iterators of these lists must lock it for read, while updates
+ * to the lists must be done with a write lock. A special case is when the
+ * device_mutex is locked. In this case locking the lists for read access is
+ * not necessary as the device_mutex implies it.
+ *
+ * lists_rwsem also protects access to the client data list.
  */
 static DEFINE_MUTEX(device_mutex);
+static DECLARE_RWSEM(lists_rwsem);
 
 static int ib_device_check_mandatory(struct ib_device *device)
 {
@@ -152,6 +167,36 @@ static int alloc_name(char *name)
 	return 0;
 }
 
+static void ib_device_release(struct device *device)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	ib_cache_release_one(dev);
+	kfree(dev->port_immutable);
+	kfree(dev);
+}
+
+static int ib_device_uevent(struct device *device,
+			    struct kobj_uevent_env *env)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	if (add_uevent_var(env, "NAME=%s", dev->name))
+		return -ENOMEM;
+
+	/*
+	 * It would be nice to pass the node GUID with the event...
+	 */
+
+	return 0;
+}
+
+static struct class ib_class = {
+	.name    = "infiniband",
+	.dev_release = ib_device_release,
+	.dev_uevent = ib_device_uevent,
+};
+
 /**
  * ib_alloc_device - allocate an IB device struct
  * @size:size of structure to allocate
@@ -164,9 +209,27 @@ static int alloc_name(char *name)
  */
 struct ib_device *ib_alloc_device(size_t size)
 {
-	BUG_ON(size < sizeof (struct ib_device));
+	struct ib_device *device;
+
+	if (WARN_ON(size < sizeof(struct ib_device)))
+		return NULL;
+
+	device = kzalloc(size, GFP_KERNEL);
+	if (!device)
+		return NULL;
+
+	device->dev.class = &ib_class;
+	device_initialize(&device->dev);
+
+	dev_set_drvdata(&device->dev, device);
+
+	INIT_LIST_HEAD(&device->event_handler_list);
+	spin_lock_init(&device->event_handler_lock);
+	spin_lock_init(&device->client_data_lock);
+	INIT_LIST_HEAD(&device->client_data_list);
+	INIT_LIST_HEAD(&device->port_list);
 
-	return kzalloc(size, GFP_KERNEL);
+	return device;
 }
 EXPORT_SYMBOL(ib_alloc_device);
 
@@ -178,13 +241,8 @@ EXPORT_SYMBOL(ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
-	if (device->reg_state == IB_DEV_UNINITIALIZED) {
-		kfree(device);
-		return;
-	}
-
-	BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
-
+	WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
+		device->reg_state != IB_DEV_UNINITIALIZED);
 	kobject_put(&device->dev.kobj);
 }
 EXPORT_SYMBOL(ib_dealloc_device);
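
ib_dealloc_device() is reduced to a reference drop because the kfree() moved into ib_device_release(), which the driver core calls once the embedded struct device's last reference goes away. This is the standard release-callback lifetime pattern; a minimal self-contained sketch of it (all names illustrative):

#include <linux/device.h>
#include <linux/slab.h>

struct demo_obj {
	int payload;
	struct device dev;	/* refcounted by the driver core */
};

static void demo_release(struct device *dev)
{
	struct demo_obj *obj = container_of(dev, struct demo_obj, dev);

	kfree(obj);	/* runs only after the last reference is dropped */
}
/* A class or bus would set .dev_release = demo_release; teardown is then
 * just a put_device(&obj->dev) / kobject_put(&obj->dev.kobj). */
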
@@ -203,10 +261,13 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
 
 	context->client = client;
 	context->data   = NULL;
+	context->going_down = false;
 
+	down_write(&lists_rwsem);
 	spin_lock_irqsave(&device->client_data_lock, flags);
 	list_add(&context->list, &device->client_data_list);
 	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	up_write(&lists_rwsem);
 
 	return 0;
 }
@@ -219,7 +280,7 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
 
 static int read_port_immutable(struct ib_device *device)
 {
-	int ret = -ENOMEM;
+	int ret;
 	u8 start_port = rdma_start_port(device);
 	u8 end_port = rdma_end_port(device);
 	u8 port;
@@ -235,26 +296,18 @@ static int read_port_immutable(struct ib_device *device)
 					 * (end_port + 1),
 					 GFP_KERNEL);
 	if (!device->port_immutable)
-		goto err;
+		return -ENOMEM;
 
 	for (port = start_port; port <= end_port; ++port) {
 		ret = device->get_port_immutable(device, port,
 						 &device->port_immutable[port]);
 		if (ret)
-			goto err;
+			return ret;
 
-		if (verify_immutable(device, port)) {
-			ret = -EINVAL;
-			goto err;
-		}
+		if (verify_immutable(device, port))
+			return -EINVAL;
 	}
-
-	ret = 0;
-	goto out;
-err:
-	kfree(device->port_immutable);
-out:
-	return ret;
+	return 0;
 }
 
 /**
@@ -271,6 +324,7 @@ int ib_register_device(struct ib_device *device,
 					    u8, struct kobject *))
 {
 	int ret;
+	struct ib_client *client;
 
 	mutex_lock(&device_mutex);
 
@@ -285,11 +339,6 @@ int ib_register_device(struct ib_device *device,
 		goto out;
 	}
 
-	INIT_LIST_HEAD(&device->event_handler_list);
-	INIT_LIST_HEAD(&device->client_data_list);
-	spin_lock_init(&device->event_handler_lock);
-	spin_lock_init(&device->client_data_lock);
-
 	ret = read_port_immutable(device);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't create per port immutable data %s\n",
@@ -297,27 +346,30 @@ int ib_register_device(struct ib_device *device,
 		goto out;
 	}
 
+	ret = ib_cache_setup_one(device);
+	if (ret) {
+		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+		goto out;
+	}
+
 	ret = ib_device_register_sysfs(device, port_callback);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
 		       device->name);
-		kfree(device->port_immutable);
+		ib_cache_cleanup_one(device);
 		goto out;
 	}
 
-	list_add_tail(&device->core_list, &device_list);
-
 	device->reg_state = IB_DEV_REGISTERED;
 
-	{
-		struct ib_client *client;
-
-		list_for_each_entry(client, &client_list, list)
-			if (client->add && !add_client_context(device, client))
-				client->add(device);
-	}
+	list_for_each_entry(client, &client_list, list)
+		if (client->add && !add_client_context(device, client))
+			client->add(device);
 
- out:
+	down_write(&lists_rwsem);
+	list_add_tail(&device->core_list, &device_list);
+	up_write(&lists_rwsem);
+out:
 	mutex_unlock(&device_mutex);
 	return ret;
 }
@@ -331,26 +383,37 @@ EXPORT_SYMBOL(ib_register_device);
  */
 void ib_unregister_device(struct ib_device *device)
 {
-	struct ib_client *client;
 	struct ib_client_data *context, *tmp;
 	unsigned long flags;
 
 	mutex_lock(&device_mutex);
 
-	list_for_each_entry_reverse(client, &client_list, list)
-		if (client->remove)
-			client->remove(device);
-
+	down_write(&lists_rwsem);
 	list_del(&device->core_list);
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+		context->going_down = true;
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	downgrade_write(&lists_rwsem);
+
+	list_for_each_entry_safe(context, tmp, &device->client_data_list,
+				 list) {
+		if (context->client->remove)
+			context->client->remove(device, context->data);
+	}
+	up_read(&lists_rwsem);
 
 	mutex_unlock(&device_mutex);
 
 	ib_device_unregister_sysfs(device);
+	ib_cache_cleanup_one(device);
 
+	down_write(&lists_rwsem);
 	spin_lock_irqsave(&device->client_data_lock, flags);
 	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
 		kfree(context);
 	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	up_write(&lists_rwsem);
 
 	device->reg_state = IB_DEV_UNREGISTERED;
 }
@@ -375,11 +438,14 @@ int ib_register_client(struct ib_client *client)
 
 	mutex_lock(&device_mutex);
 
-	list_add_tail(&client->list, &client_list);
 	list_for_each_entry(device, &device_list, core_list)
 		if (client->add && !add_client_context(device, client))
 			client->add(device);
 
+	down_write(&lists_rwsem);
+	list_add_tail(&client->list, &client_list);
+	up_write(&lists_rwsem);
+
 	mutex_unlock(&device_mutex);
 
 	return 0;
@@ -402,19 +468,41 @@ void ib_unregister_client(struct ib_client *client)
 
 	mutex_lock(&device_mutex);
 
+	down_write(&lists_rwsem);
+	list_del(&client->list);
+	up_write(&lists_rwsem);
+
 	list_for_each_entry(device, &device_list, core_list) {
-		if (client->remove)
-			client->remove(device);
+		struct ib_client_data *found_context = NULL;
 
+		down_write(&lists_rwsem);
 		spin_lock_irqsave(&device->client_data_lock, flags);
 		list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
 			if (context->client == client) {
-				list_del(&context->list);
-				kfree(context);
+				context->going_down = true;
+				found_context = context;
+				break;
 			}
 		spin_unlock_irqrestore(&device->client_data_lock, flags);
+		up_write(&lists_rwsem);
+
+		if (client->remove)
+			client->remove(device, found_context ?
+					       found_context->data : NULL);
+
+		if (!found_context) {
+			pr_warn("No client context found for %s/%s\n",
+				device->name, client->name);
+			continue;
+		}
+
+		down_write(&lists_rwsem);
+		spin_lock_irqsave(&device->client_data_lock, flags);
+		list_del(&found_context->list);
+		kfree(found_context);
+		spin_unlock_irqrestore(&device->client_data_lock, flags);
+		up_write(&lists_rwsem);
 	}
-	list_del(&client->list);
 
 	mutex_unlock(&device_mutex);
 }
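
Every in-tree client is converted below (mad.c, multicast.c, cma.c) to the new two-argument remove() callback, which hands back the client data directly instead of requiring ib_get_client_data() on the teardown path. A minimal sketch of a client written against the new contract; all names are illustrative:

#include <linux/slab.h>
#include <rdma/ib_verbs.h>

struct demo_state {
	struct ib_device *device;
};

static struct ib_client demo_client;

static void demo_add_one(struct ib_device *device)
{
	struct demo_state *st = kzalloc(sizeof(*st), GFP_KERNEL);

	if (!st)
		return;
	st->device = device;
	ib_set_client_data(device, &demo_client, st);
}

static void demo_remove_one(struct ib_device *device, void *client_data)
{
	kfree(client_data);	/* may be NULL if add_one() bailed out */
}

static struct ib_client demo_client = {
	.name	= "demo",
	.add	= demo_add_one,
	.remove	= demo_remove_one,
};
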
@@ -590,10 +678,79 @@ EXPORT_SYMBOL(ib_query_port);
 int ib_query_gid(struct ib_device *device,
 		 u8 port_num, int index, union ib_gid *gid)
 {
+	if (rdma_cap_roce_gid_table(device, port_num))
+		return ib_get_cached_gid(device, port_num, index, gid);
+
 	return device->query_gid(device, port_num, index, gid);
 }
 EXPORT_SYMBOL(ib_query_gid);
 
+/**
+ * ib_enum_roce_netdev - enumerate all RoCE ports
+ * @ib_dev: IB device we want to query
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE port
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates the physical RoCE ports of ib_dev that are associated with
+ * a netdevice and calls cb() on each port for which filter() returns a
+ * non-zero value.
+ */
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+			 roce_netdev_filter filter,
+			 void *filter_cookie,
+			 roce_netdev_callback cb,
+			 void *cookie)
+{
+	u8 port;
+
+	for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
+	     port++)
+		if (rdma_protocol_roce(ib_dev, port)) {
+			struct net_device *idev = NULL;
+
+			if (ib_dev->get_netdev)
+				idev = ib_dev->get_netdev(ib_dev, port);
+
+			if (idev &&
+			    idev->reg_state >= NETREG_UNREGISTERED) {
+				dev_put(idev);
+				idev = NULL;
+			}
+
+			if (filter(ib_dev, port, idev, filter_cookie))
+				cb(ib_dev, port, idev, cookie);
+
+			if (idev)
+				dev_put(idev);
+		}
+}
+
+/**
+ * ib_enum_all_roce_netdevs - enumerate all RoCE devices
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE port
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates the physical RoCE ports of all registered IB devices that
+ * are associated with a netdevice and calls cb() on each port for which
+ * filter() returns a non-zero value.
+ */
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+			      void *filter_cookie,
+			      roce_netdev_callback cb,
+			      void *cookie)
+{
+	struct ib_device *dev;
+
+	down_read(&lists_rwsem);
+	list_for_each_entry(dev, &device_list, core_list)
+		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
+	up_read(&lists_rwsem);
+}
+
 /**
  * ib_query_pkey - Get P_Key table entry
  * @device:Device to query
@@ -673,6 +830,14 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
 	int ret, port, i;
 
 	for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+		if (rdma_cap_roce_gid_table(device, port)) {
+			if (!ib_cache_gid_find_by_port(device, gid, port,
+						       NULL, index)) {
+				*port_num = port;
+				return 0;
+			}
+		}
+
 		for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
 			ret = ib_query_gid(device, port, i, &tmp_gid);
 			if (ret)
@@ -729,6 +894,51 @@ int ib_find_pkey(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_find_pkey);
 
+/**
+ * ib_get_net_dev_by_params() - Return the appropriate net_dev
+ * for a received CM request
+ * @dev:	An RDMA device on which the request has been received.
+ * @port:	Port number on the RDMA device.
+ * @pkey:	The P_Key the request came on.
+ * @gid:	A GID that the net_dev uses to communicate.
+ * @addr:	Contains the IP address that the request specified as its
+ *		destination.
+ */
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
+					    u8 port,
+					    u16 pkey,
+					    const union ib_gid *gid,
+					    const struct sockaddr *addr)
+{
+	struct net_device *net_dev = NULL;
+	struct ib_client_data *context;
+
+	if (!rdma_protocol_ib(dev, port))
+		return NULL;
+
+	down_read(&lists_rwsem);
+
+	list_for_each_entry(context, &dev->client_data_list, list) {
+		struct ib_client *client = context->client;
+
+		if (context->going_down)
+			continue;
+
+		if (client->get_net_dev_by_params) {
+			net_dev = client->get_net_dev_by_params(dev, port, pkey,
+								gid, addr,
+								context->data);
+			if (net_dev)
+				break;
+		}
+	}
+
+	up_read(&lists_rwsem);
+
+	return net_dev;
+}
+EXPORT_SYMBOL(ib_get_net_dev_by_params);
+
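
A sketch of how a consumer might use the new lookup; request_addr_is_ours() is a hypothetical helper, and the dev_put() mirrors the held reference the lookup returns (as the dev_put() in cma_id_from_event() above demonstrates):

static bool request_addr_is_ours(struct ib_device *dev, u8 port, u16 pkey,
				 const union ib_gid *gid,
				 const struct sockaddr *dst)
{
	struct net_device *ndev;

	ndev = ib_get_net_dev_by_params(dev, port, pkey, gid, dst);
	if (!ndev)
		return false;

	dev_put(ndev);	/* the lookup returned a held reference */
	return true;
}
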
 static int __init ib_core_init(void)
 {
 	int ret;
@@ -737,7 +947,7 @@ static int __init ib_core_init(void)
 	if (!ib_wq)
 		return -ENOMEM;
 
-	ret = ib_sysfs_setup();
+	ret = class_register(&ib_class);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
 		goto err;
@@ -749,19 +959,12 @@ static int __init ib_core_init(void)
 		goto err_sysfs;
 	}
 
-	ret = ib_cache_setup();
-	if (ret) {
-		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
-		goto err_nl;
-	}
+	ib_cache_setup();
 
 	return 0;
 
-err_nl:
-	ibnl_cleanup();
-
 err_sysfs:
-	ib_sysfs_cleanup();
+	class_unregister(&ib_class);
 
 err:
 	destroy_workqueue(ib_wq);
@@ -772,7 +975,7 @@ static void __exit ib_core_cleanup(void)
 {
 	ib_cache_cleanup();
 	ibnl_cleanup();
-	ib_sysfs_cleanup();
+	class_unregister(&ib_class);
 	/* Make sure that any pending umem accounting work is done. */
 	destroy_workqueue(ib_wq);
 }
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 786fc51bf..4b5c72311 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -338,13 +338,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
 		goto error1;
 	}
 
-	mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
-						 IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(mad_agent_priv->agent.mr)) {
-		ret = ERR_PTR(-ENOMEM);
-		goto error2;
-	}
-
 	if (mad_reg_req) {
 		reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
 		if (!reg_req) {
@@ -429,8 +422,6 @@ error4:
 	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
 	kfree(reg_req);
 error3:
-	ib_dereg_mr(mad_agent_priv->agent.mr);
-error2:
 	kfree(mad_agent_priv);
 error1:
 	return ret;
@@ -590,7 +581,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
 	wait_for_completion(&mad_agent_priv->comp);
 
 	kfree(mad_agent_priv->reg_req);
-	ib_dereg_mr(mad_agent_priv->agent.mr);
 	kfree(mad_agent_priv);
 }
 
@@ -1038,7 +1028,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
 
 	mad_send_wr->mad_agent_priv = mad_agent_priv;
 	mad_send_wr->sg_list[0].length = hdr_len;
-	mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+	mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;
 
 	/* OPA MADs don't have to be the full 2048 bytes */
 	if (opa && base_version == OPA_MGMT_BASE_VERSION &&
@@ -1047,7 +1037,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
 	else
 		mad_send_wr->sg_list[1].length = mad_size - hdr_len;
 
-	mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+	mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
 
 	mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
 	mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
@@ -2885,7 +2875,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
 	struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
 
 	/* Initialize common scatter list fields */
-	sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+	sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
 
 	/* Initialize common receive WR fields */
 	recv_wr.next = NULL;
@@ -3201,13 +3191,6 @@ static int ib_mad_port_open(struct ib_device *device,
 		goto error4;
 	}
 
-	port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(port_priv->mr)) {
-		dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
-		ret = PTR_ERR(port_priv->mr);
-		goto error5;
-	}
-
 	if (has_smi) {
 		ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
 		if (ret)
@@ -3248,8 +3231,6 @@ error8:
 error7:
 	destroy_mad_qp(&port_priv->qp_info[0]);
 error6:
-	ib_dereg_mr(port_priv->mr);
-error5:
 	ib_dealloc_pd(port_priv->pd);
 error4:
 	ib_destroy_cq(port_priv->cq);
@@ -3284,7 +3265,6 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
 	destroy_workqueue(port_priv->wq);
 	destroy_mad_qp(&port_priv->qp_info[1]);
 	destroy_mad_qp(&port_priv->qp_info[0]);
-	ib_dereg_mr(port_priv->mr);
 	ib_dealloc_pd(port_priv->pd);
 	ib_destroy_cq(port_priv->cq);
 	cleanup_recv_queue(&port_priv->qp_info[1]);
@@ -3335,7 +3315,7 @@ error:
 	}
 }
 
-static void ib_mad_remove_device(struct ib_device *device)
+static void ib_mad_remove_device(struct ib_device *device, void *client_data)
 {
 	int i;
 
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 5be89f989..4a4f7aad0 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -199,7 +199,6 @@ struct ib_mad_port_private {
 	int port_num;
 	struct ib_cq *cq;
 	struct ib_pd *pd;
-	struct ib_mr *mr;
 
 	spinlock_t reg_lock;
 	struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index 2cb865c7c..d38d8b2b2 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -43,7 +43,7 @@
 #include "sa.h"
 
 static void mcast_add_one(struct ib_device *device);
-static void mcast_remove_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client mcast_client = {
 	.name   = "ib_multicast",
@@ -840,13 +840,12 @@ static void mcast_add_one(struct ib_device *device)
 	ib_register_event_handler(&dev->event_handler);
 }
 
-static void mcast_remove_one(struct ib_device *device)
+static void mcast_remove_one(struct ib_device *device, void *client_data)
 {
-	struct mcast_device *dev;
+	struct mcast_device *dev = client_data;
 	struct mcast_port *port;
 	int i;
 
-	dev = ib_get_client_data(device, &mcast_client);
 	if (!dev)
 		return;
 
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 23dd5a5c7..d47df9356 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -49,6 +49,14 @@ static DEFINE_MUTEX(ibnl_mutex);
 static struct sock *nls;
 static LIST_HEAD(client_list);
 
+int ibnl_chk_listeners(unsigned int group)
+{
+	if (netlink_has_listeners(nls, group) == 0)
+		return -1;
+	return 0;
+}
+EXPORT_SYMBOL(ibnl_chk_listeners);
+
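
A sketch of the intended use: probe for userspace listeners before taking the netlink path, assuming the RDMA_NL_GROUP_LS multicast group that the SA local-service code later in this patch relies on:

static bool userspace_resolver_available(void)
{
	/* ibnl_chk_listeners() returns non-zero when nobody listens on
	 * the group; fall back to a plain SA MAD query in that case. */
	return ibnl_chk_listeners(RDMA_NL_GROUP_LS) == 0;
}
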
 int ibnl_add_client(int index, int nops,
 		    const struct ibnl_client_cbs cb_table[])
 {
@@ -151,6 +159,23 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			    !client->cb_table[op].dump)
 				return -EINVAL;
 
+			/*
+			 * For a response or a local service set_timeout request,
+			 * there is no need to use netlink_dump_start.
+			 */
+			if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
+			    (index == RDMA_NL_LS &&
+			     op == RDMA_NL_LS_OP_SET_TIMEOUT)) {
+				struct netlink_callback cb = {
+					.skb = skb,
+					.nlh = nlh,
+					.dump = client->cb_table[op].dump,
+					.module = client->cb_table[op].module,
+				};
+
+				return cb.dump(skb, &cb);
+			}
+
 			{
 				struct netlink_dump_control c = {
 					.dump = client->cb_table[op].dump,
@@ -165,9 +190,39 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	return -EINVAL;
 }
 
+static void ibnl_rcv_reply_skb(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh;
+	int msglen;
+
+	/*
+	 * Process responses until there are no more messages or until the
+	 * first request is encountered. Generally speaking, mixing
+	 * responses with requests is not recommended.
+	 */
+	while (skb->len >= nlmsg_total_size(0)) {
+		nlh = nlmsg_hdr(skb);
+
+		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+			return;
+
+		/* Handle response only */
+		if (nlh->nlmsg_flags & NLM_F_REQUEST)
+			return;
+
+		ibnl_rcv_msg(skb, nlh);
+
+		msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (msglen > skb->len)
+			msglen = skb->len;
+		skb_pull(skb, msglen);
+	}
+}
+
 static void ibnl_rcv(struct sk_buff *skb)
 {
 	mutex_lock(&ibnl_mutex);
+	ibnl_rcv_reply_skb(skb);
 	netlink_rcv_skb(skb, &ibnl_rcv_msg);
 	mutex_unlock(&ibnl_mutex);
 }
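
ibnl_rcv() now dispatches response messages (those without NLM_F_REQUEST) synchronously via ibnl_rcv_reply_skb() before handing the rest to netlink_rcv_skb(). A hedged userspace sketch of composing such a response for the local-service client (payload layout omitted; the socket is assumed to be bound to NETLINK_RDMA):

#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <rdma/rdma_netlink.h>

static int send_ls_response(int fd, __u32 seq,
			    const void *payload, int payload_len)
{
	char buf[1024];
	struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
	struct sockaddr_nl dst = { .nl_family = AF_NETLINK };

	if (payload_len < 0 ||
	    (size_t)NLMSG_LENGTH(payload_len) > sizeof(buf))
		return -1;

	memset(buf, 0, sizeof(buf));
	nlh->nlmsg_len = NLMSG_LENGTH(payload_len);
	nlh->nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_LS, RDMA_NL_LS_OP_RESOLVE);
	nlh->nlmsg_flags = 0;	/* no NLM_F_REQUEST: treated as a response */
	nlh->nlmsg_seq = seq;	/* echo the kernel's sequence number */
	memcpy(NLMSG_DATA(nlh), payload, payload_len);

	return sendto(fd, buf, nlh->nlmsg_len, 0,
		      (struct sockaddr *)&dst, sizeof(dst));
}
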
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
new file mode 100644
index 000000000..178f98482
--- /dev/null
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -0,0 +1,747 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/in.h>
+#include <linux/in6.h>
+
+/* For in6_dev_get/in6_dev_put */
+#include <net/addrconf.h>
+#include <net/bonding.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+enum gid_op_type {
+	GID_DEL = 0,
+	GID_ADD
+};
+
+struct update_gid_event_work {
+	struct work_struct work;
+	union ib_gid       gid;
+	struct ib_gid_attr gid_attr;
+	enum gid_op_type gid_op;
+};
+
+#define ROCE_NETDEV_CALLBACK_SZ		3
+struct netdev_event_work_cmd {
+	roce_netdev_callback	cb;
+	roce_netdev_filter	filter;
+	struct net_device	*ndev;
+	struct net_device	*filter_ndev;
+};
+
+struct netdev_event_work {
+	struct work_struct		work;
+	struct netdev_event_work_cmd	cmds[ROCE_NETDEV_CALLBACK_SZ];
+};
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+		       u8 port, union ib_gid *gid,
+		       struct ib_gid_attr *gid_attr)
+{
+	switch (gid_op) {
+	case GID_ADD:
+		ib_cache_gid_add(ib_dev, port, gid, gid_attr);
+		break;
+	case GID_DEL:
+		ib_cache_gid_del(ib_dev, port, gid, gid_attr);
+		break;
+	}
+}
+
+enum bonding_slave_state {
+	BONDING_SLAVE_STATE_ACTIVE	= 1UL << 0,
+	BONDING_SLAVE_STATE_INACTIVE	= 1UL << 1,
+	/* No primary slave or the device isn't a slave in bonding */
+	BONDING_SLAVE_STATE_NA		= 1UL << 2,
+};
+
+static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev,
+								   struct net_device *upper)
+{
+	if (upper && netif_is_bond_master(upper)) {
+		struct net_device *pdev =
+			bond_option_active_slave_get_rcu(netdev_priv(upper));
+
+		if (pdev)
+			return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE :
+				BONDING_SLAVE_STATE_INACTIVE;
+	}
+
+	return BONDING_SLAVE_STATE_NA;
+}
+
+static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper)
+{
+	struct net_device *_upper = NULL;
+	struct list_head *iter;
+
+	netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
+		if (_upper == upper)
+			break;
+
+	return _upper == upper;
+}
+
+#define REQUIRED_BOND_STATES		(BONDING_SLAVE_STATE_ACTIVE |	\
+					 BONDING_SLAVE_STATE_NA)
+static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
+				 struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *event_ndev = (struct net_device *)cookie;
+	struct net_device *real_dev;
+	int res;
+
+	if (!rdma_ndev)
+		return 0;
+
+	rcu_read_lock();
+	real_dev = rdma_vlan_dev_real_dev(event_ndev);
+	if (!real_dev)
+		real_dev = event_ndev;
+
+	res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+	       (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
+		REQUIRED_BOND_STATES)) ||
+	       real_dev == rdma_ndev);
+
+	rcu_read_unlock();
+	return res;
+}
+
+static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port,
+				      struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *master_dev;
+	int res;
+
+	if (!rdma_ndev)
+		return 0;
+
+	rcu_read_lock();
+	master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+	res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) ==
+		BONDING_SLAVE_STATE_INACTIVE;
+	rcu_read_unlock();
+
+	return res;
+}
+
+static int pass_all_filter(struct ib_device *ib_dev, u8 port,
+			   struct net_device *rdma_ndev, void *cookie)
+{
+	return 1;
+}
+
+static int upper_device_filter(struct ib_device *ib_dev, u8 port,
+			       struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *event_ndev = (struct net_device *)cookie;
+	int res;
+
+	if (!rdma_ndev)
+		return 0;
+
+	if (rdma_ndev == event_ndev)
+		return 1;
+
+	rcu_read_lock();
+	res = is_upper_dev_rcu(rdma_ndev, event_ndev);
+	rcu_read_unlock();
+
+	return res;
+}
+
+static void update_gid_ip(enum gid_op_type gid_op,
+			  struct ib_device *ib_dev,
+			  u8 port, struct net_device *ndev,
+			  struct sockaddr *addr)
+{
+	union ib_gid gid;
+	struct ib_gid_attr gid_attr;
+
+	rdma_ip2gid(addr, &gid);
+	memset(&gid_attr, 0, sizeof(gid_attr));
+	gid_attr.ndev = ndev;
+
+	update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
+}
+
+static void enum_netdev_default_gids(struct ib_device *ib_dev,
+				     u8 port, struct net_device *event_ndev,
+				     struct net_device *rdma_ndev)
+{
+	rcu_read_lock();
+	if (!rdma_ndev ||
+	    ((rdma_ndev != event_ndev &&
+	      !is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
+	     is_eth_active_slave_of_bonding_rcu(rdma_ndev,
+						netdev_master_upper_dev_get_rcu(rdma_ndev)) ==
+	     BONDING_SLAVE_STATE_INACTIVE)) {
+		rcu_read_unlock();
+		return;
+	}
+	rcu_read_unlock();
+
+	ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+				     IB_CACHE_GID_DEFAULT_MODE_SET);
+}
+
+static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
+					    u8 port,
+					    struct net_device *event_ndev,
+					    struct net_device *rdma_ndev)
+{
+	struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev);
+
+	if (!rdma_ndev)
+		return;
+
+	if (!real_dev)
+		real_dev = event_ndev;
+
+	rcu_read_lock();
+
+	if (is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+	    is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
+	    BONDING_SLAVE_STATE_INACTIVE) {
+		rcu_read_unlock();
+
+		ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+					     IB_CACHE_GID_DEFAULT_MODE_DELETE);
+	} else {
+		rcu_read_unlock();
+	}
+}
+
+static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
+				 u8 port, struct net_device *ndev)
+{
+	struct in_device *in_dev;
+	struct sin_list {
+		struct list_head	list;
+		struct sockaddr_in	ip;
+	};
+	struct sin_list *sin_iter;
+	struct sin_list *sin_temp;
+
+	LIST_HEAD(sin_list);
+	if (ndev->reg_state >= NETREG_UNREGISTERING)
+		return;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(ndev);
+	if (!in_dev) {
+		rcu_read_unlock();
+		return;
+	}
+
+	for_ifa(in_dev) {
+		struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+		if (!entry) {
+			pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv4 update\n");
+			continue;
+		}
+		entry->ip.sin_family = AF_INET;
+		entry->ip.sin_addr.s_addr = ifa->ifa_address;
+		list_add_tail(&entry->list, &sin_list);
+	}
+	endfor_ifa(in_dev);
+	rcu_read_unlock();
+
+	list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) {
+		update_gid_ip(GID_ADD, ib_dev, port, ndev,
+			      (struct sockaddr *)&sin_iter->ip);
+		list_del(&sin_iter->list);
+		kfree(sin_iter);
+	}
+}
+
+static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
+				 u8 port, struct net_device *ndev)
+{
+	struct inet6_ifaddr *ifp;
+	struct inet6_dev *in6_dev;
+	struct sin6_list {
+		struct list_head	list;
+		struct sockaddr_in6	sin6;
+	};
+	struct sin6_list *sin6_iter;
+	struct sin6_list *sin6_temp;
+	struct ib_gid_attr gid_attr = {.ndev = ndev};
+	LIST_HEAD(sin6_list);
+
+	if (ndev->reg_state >= NETREG_UNREGISTERING)
+		return;
+
+	in6_dev = in6_dev_get(ndev);
+	if (!in6_dev)
+		return;
+
+	read_lock_bh(&in6_dev->lock);
+	list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+		struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+		if (!entry) {
+			pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv6 update\n");
+			continue;
+		}
+
+		entry->sin6.sin6_family = AF_INET6;
+		entry->sin6.sin6_addr = ifp->addr;
+		list_add_tail(&entry->list, &sin6_list);
+	}
+	read_unlock_bh(&in6_dev->lock);
+
+	in6_dev_put(in6_dev);
+
+	list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) {
+		union ib_gid	gid;
+
+		rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid);
+		update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
+		list_del(&sin6_iter->list);
+		kfree(sin6_iter);
+	}
+}
+
+static void _add_netdev_ips(struct ib_device *ib_dev, u8 port,
+			    struct net_device *ndev)
+{
+	enum_netdev_ipv4_ips(ib_dev, port, ndev);
+	if (IS_ENABLED(CONFIG_IPV6))
+		enum_netdev_ipv6_ips(ib_dev, port, ndev);
+}
+
+static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
+			   struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *event_ndev = (struct net_device *)cookie;
+
+	enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+	_add_netdev_ips(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
+			   struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *event_ndev = (struct net_device *)cookie;
+
+	ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
+				    u8 port,
+				    struct net_device *rdma_ndev,
+				    void *cookie)
+{
+	struct net *net;
+	struct net_device *ndev;
+
+	/* Take the rtnl lock to make sure the netdevs do not move under
+	 * our feet.
+	 */
+	rtnl_lock();
+	for_each_net(net)
+		for_each_netdev(net, ndev)
+			if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev))
+				add_netdev_ips(ib_dev, port, rdma_ndev, ndev);
+	rtnl_unlock();
+}
+
+/* This function rescans all of the network devices in the system and
+ * adds their GIDs, as needed, to the relevant RoCE devices. */
+int roce_rescan_device(struct ib_device *ib_dev)
+{
+	ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
+			    enum_all_gids_of_dev_cb, NULL);
+
+	return 0;
+}
+
+static void callback_for_addr_gid_device_scan(struct ib_device *device,
+					      u8 port,
+					      struct net_device *rdma_ndev,
+					      void *cookie)
+{
+	struct update_gid_event_work *parsed = cookie;
+
+	return update_gid(parsed->gid_op, device,
+			  port, &parsed->gid,
+			  &parsed->gid_attr);
+}
+
+static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
+				void *cookie,
+				void (*handle_netdev)(struct ib_device *ib_dev,
+						      u8 port,
+						      struct net_device *ndev))
+{
+	struct net_device *ndev = (struct net_device *)cookie;
+	struct upper_list {
+		struct list_head list;
+		struct net_device *upper;
+	};
+	struct net_device *upper;
+	struct list_head *iter;
+	struct upper_list *upper_iter;
+	struct upper_list *upper_temp;
+	LIST_HEAD(upper_list);
+
+	rcu_read_lock();
+	netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
+		struct upper_list *entry = kmalloc(sizeof(*entry),
+						   GFP_ATOMIC);
+
+		if (!entry) {
+			pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n");
+			continue;
+		}
+
+		list_add_tail(&entry->list, &upper_list);
+		dev_hold(upper);
+		entry->upper = upper;
+	}
+	rcu_read_unlock();
+
+	handle_netdev(ib_dev, port, ndev);
+	list_for_each_entry_safe(upper_iter, upper_temp, &upper_list,
+				 list) {
+		handle_netdev(ib_dev, port, upper_iter->upper);
+		dev_put(upper_iter->upper);
+		list_del(&upper_iter->list);
+		kfree(upper_iter);
+	}
+}
+
+static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+				      struct net_device *event_ndev)
+{
+	ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+				 struct net_device *rdma_ndev, void *cookie)
+{
+	handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
+}
+
+static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+				 struct net_device *rdma_ndev, void *cookie)
+{
+	handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips);
+}
+
+static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port,
+					struct net_device *rdma_ndev,
+					void *cookie)
+{
+	struct net_device *master_ndev;
+
+	rcu_read_lock();
+	master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+	if (master_ndev)
+		dev_hold(master_ndev);
+	rcu_read_unlock();
+
+	if (master_ndev) {
+		bond_delete_netdev_default_gids(ib_dev, port, master_ndev,
+						rdma_ndev);
+		dev_put(master_ndev);
+	}
+}
+
+static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port,
+				   struct net_device *rdma_ndev, void *cookie)
+{
+	struct net_device *event_ndev = (struct net_device *)cookie;
+
+	bond_delete_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+}
+
+/* The following functions operate on all IB devices. netdevice_event and
+ * addr_event run ib_enum_all_roce_netdevs from a work item.
+ * ib_enum_all_roce_netdevs iterates through all IB devices.
+ */
+
+static void netdevice_event_work_handler(struct work_struct *_work)
+{
+	struct netdev_event_work *work =
+		container_of(_work, struct netdev_event_work, work);
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
+		ib_enum_all_roce_netdevs(work->cmds[i].filter,
+					 work->cmds[i].filter_ndev,
+					 work->cmds[i].cb,
+					 work->cmds[i].ndev);
+		dev_put(work->cmds[i].ndev);
+		dev_put(work->cmds[i].filter_ndev);
+	}
+
+	kfree(work);
+}
+
+static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
+				struct net_device *ndev)
+{
+	unsigned int i;
+	struct netdev_event_work *ndev_work =
+		kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+
+	if (!ndev_work) {
+		pr_warn("roce_gid_mgmt: can't allocate work for netdevice_event\n");
+		return NOTIFY_DONE;
+	}
+
+	memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
+	for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
+		if (!ndev_work->cmds[i].ndev)
+			ndev_work->cmds[i].ndev = ndev;
+		if (!ndev_work->cmds[i].filter_ndev)
+			ndev_work->cmds[i].filter_ndev = ndev;
+		dev_hold(ndev_work->cmds[i].ndev);
+		dev_hold(ndev_work->cmds[i].filter_ndev);
+	}
+	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
+
+	queue_work(ib_wq, &ndev_work->work);
+
+	return NOTIFY_DONE;
+}
+
+static const struct netdev_event_work_cmd add_cmd = {
+	.cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
+static const struct netdev_event_work_cmd add_cmd_upper_ips = {
+	.cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev};
+
+static void netdevice_event_changeupper(struct netdev_notifier_changeupper_info *changeupper_info,
+					struct netdev_event_work_cmd *cmds)
+{
+	static const struct netdev_event_work_cmd upper_ips_del_cmd = {
+		.cb = del_netdev_upper_ips, .filter = upper_device_filter};
+	static const struct netdev_event_work_cmd bonding_default_del_cmd = {
+		.cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave};
+
+	if (!changeupper_info->linking) {
+		cmds[0] = upper_ips_del_cmd;
+		cmds[0].ndev = changeupper_info->upper_dev;
+		cmds[1] = add_cmd;
+	} else {
+		cmds[0] = bonding_default_del_cmd;
+		cmds[0].ndev = changeupper_info->upper_dev;
+		cmds[1] = add_cmd_upper_ips;
+		cmds[1].ndev = changeupper_info->upper_dev;
+		cmds[1].filter_ndev = changeupper_info->upper_dev;
+	}
+}
+
+static int netdevice_event(struct notifier_block *this, unsigned long event,
+			   void *ptr)
+{
+	static const struct netdev_event_work_cmd del_cmd = {
+		.cb = del_netdev_ips, .filter = pass_all_filter};
+	static const struct netdev_event_work_cmd bonding_default_del_cmd_join = {
+		.cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave};
+	static const struct netdev_event_work_cmd default_del_cmd = {
+		.cb = del_netdev_default_ips, .filter = pass_all_filter};
+	static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
+		.cb = del_netdev_upper_ips, .filter = upper_device_filter};
+	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
+
+	if (ndev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+	case NETDEV_UP:
+		cmds[0] = bonding_default_del_cmd_join;
+		cmds[1] = add_cmd;
+		break;
+
+	case NETDEV_UNREGISTER:
+		if (ndev->reg_state < NETREG_UNREGISTERED)
+			cmds[0] = del_cmd;
+		else
+			return NOTIFY_DONE;
+		break;
+
+	case NETDEV_CHANGEADDR:
+		cmds[0] = default_del_cmd;
+		cmds[1] = add_cmd;
+		break;
+
+	case NETDEV_CHANGEUPPER:
+		netdevice_event_changeupper(
+			container_of(ptr, struct netdev_notifier_changeupper_info, info),
+			cmds);
+		break;
+
+	case NETDEV_BONDING_FAILOVER:
+		cmds[0] = bonding_event_ips_del_cmd;
+		cmds[1] = bonding_default_del_cmd_join;
+		cmds[2] = add_cmd_upper_ips;
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return netdevice_queue_work(cmds, ndev);
+}
+
+static void update_gid_event_work_handler(struct work_struct *_work)
+{
+	struct update_gid_event_work *work =
+		container_of(_work, struct update_gid_event_work, work);
+
+	ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev,
+				 callback_for_addr_gid_device_scan, work);
+
+	dev_put(work->gid_attr.ndev);
+	kfree(work);
+}
+
+static int addr_event(struct notifier_block *this, unsigned long event,
+		      struct sockaddr *sa, struct net_device *ndev)
+{
+	struct update_gid_event_work *work;
+	enum gid_op_type gid_op;
+
+	if (ndev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		gid_op = GID_ADD;
+		break;
+
+	case NETDEV_DOWN:
+		gid_op = GID_DEL;
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work) {
+		pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n");
+		return NOTIFY_DONE;
+	}
+
+	INIT_WORK(&work->work, update_gid_event_work_handler);
+
+	rdma_ip2gid(sa, &work->gid);
+	work->gid_op = gid_op;
+
+	memset(&work->gid_attr, 0, sizeof(work->gid_attr));
+	dev_hold(ndev);
+	work->gid_attr.ndev   = ndev;
+
+	queue_work(ib_wq, &work->work);
+
+	return NOTIFY_DONE;
+}
+
+static int inetaddr_event(struct notifier_block *this, unsigned long event,
+			  void *ptr)
+{
+	struct sockaddr_in	in;
+	struct net_device	*ndev;
+	struct in_ifaddr	*ifa = ptr;
+
+	in.sin_family = AF_INET;
+	in.sin_addr.s_addr = ifa->ifa_address;
+	ndev = ifa->ifa_dev->dev;
+
+	return addr_event(this, event, (struct sockaddr *)&in, ndev);
+}
+
+static int inet6addr_event(struct notifier_block *this, unsigned long event,
+			   void *ptr)
+{
+	struct sockaddr_in6	in6;
+	struct net_device	*ndev;
+	struct inet6_ifaddr	*ifa6 = ptr;
+
+	in6.sin6_family = AF_INET6;
+	in6.sin6_addr = ifa6->addr;
+	ndev = ifa6->idev->dev;
+
+	return addr_event(this, event, (struct sockaddr *)&in6, ndev);
+}
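+
+/*
+ * Both notifiers funnel into addr_event() through a generic struct
+ * sockaddr; rdma_ip2gid() then derives the RoCE GID. As a sketch of the
+ * expected mapping (IPv4 addresses are assumed to become IPv4-mapped
+ * GIDs):
+ *
+ *	AF_INET   192.0.2.1    ->  GID ::ffff:192.0.2.1
+ *	AF_INET6  2001:db8::1  ->  GID 2001:db8::1
+ */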
+
+static struct notifier_block nb_netdevice = {
+	.notifier_call = netdevice_event
+};
+
+static struct notifier_block nb_inetaddr = {
+	.notifier_call = inetaddr_event
+};
+
+static struct notifier_block nb_inet6addr = {
+	.notifier_call = inet6addr_event
+};
+
+int __init roce_gid_mgmt_init(void)
+{
+	register_inetaddr_notifier(&nb_inetaddr);
+	if (IS_ENABLED(CONFIG_IPV6))
+		register_inet6addr_notifier(&nb_inet6addr);
+	/* We rely on the netdevice notifier to enumerate all
+	 * existing devices in the system. Register to this notifier
+	 * last to make sure we will not miss any IP add/del
+	 * callbacks.
+	 */
+	register_netdevice_notifier(&nb_netdevice);
+
+	return 0;
+}
+
+void __exit roce_gid_mgmt_cleanup(void)
+{
+	if (IS_ENABLED(CONFIG_IPV6))
+		unregister_inet6addr_notifier(&nb_inet6addr);
+	unregister_inetaddr_notifier(&nb_inetaddr);
+	unregister_netdevice_notifier(&nb_netdevice);
+	/* Ensure all gid deletion tasks complete before we go down,
+	 * to avoid any reference to freed memory. By the time
+	 * ib-core is removed, all physical devices have been removed,
+	 * so no issue with remaining hardware contexts.
+	 */
+}
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index ca919f429..8c014b33d 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -45,12 +45,21 @@
 #include <uapi/linux/if_ether.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <uapi/rdma/ib_user_sa.h>
+#include <rdma/ib_marshall.h>
 #include "sa.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand subnet administration query support");
 MODULE_LICENSE("Dual BSD/GPL");
 
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN		100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT		2000
+#define IB_SA_LOCAL_SVC_TIMEOUT_MAX		200000
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
 struct ib_sa_sm_ah {
 	struct ib_ah        *ah;
 	struct kref          ref;
@@ -80,8 +89,16 @@ struct ib_sa_query {
 	struct ib_mad_send_buf *mad_buf;
 	struct ib_sa_sm_ah     *sm_ah;
 	int			id;
+	u32			flags;
+	struct list_head	list; /* Local svc request list */
+	u32			seq; /* Local svc request sequence number */
+	unsigned long		timeout; /* Local svc timeout */
+	u8			path_use; /* How will the pathrecord be used */
 };
 
+#define IB_SA_ENABLE_LOCAL_SERVICE	0x00000001
+#define IB_SA_CANCEL			0x00000002
+
 struct ib_sa_service_query {
 	void (*callback)(int, struct ib_sa_service_rec *, void *);
 	void *context;
@@ -106,8 +123,28 @@ struct ib_sa_mcmember_query {
 	struct ib_sa_query sa_query;
 };
 
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = {
+	[LS_NLA_TYPE_PATH_RECORD]	= {.type = NLA_BINARY,
+		.len = sizeof(struct ib_path_rec_data)},
+	[LS_NLA_TYPE_TIMEOUT]		= {.type = NLA_U32},
+	[LS_NLA_TYPE_SERVICE_ID]	= {.type = NLA_U64},
+	[LS_NLA_TYPE_DGID]		= {.type = NLA_BINARY,
+		.len = sizeof(struct rdma_nla_ls_gid)},
+	[LS_NLA_TYPE_SGID]		= {.type = NLA_BINARY,
+		.len = sizeof(struct rdma_nla_ls_gid)},
+	[LS_NLA_TYPE_TCLASS]		= {.type = NLA_U8},
+	[LS_NLA_TYPE_PKEY]		= {.type = NLA_U16},
+	[LS_NLA_TYPE_QOS_CLASS]		= {.type = NLA_U16},
+};
+
+
 static void ib_sa_add_one(struct ib_device *device);
-static void ib_sa_remove_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client sa_client = {
 	.name   = "sa",
@@ -381,6 +418,427 @@ static const struct ib_field guidinfo_rec_table[] = {
 	  .size_bits    = 512 },
 };
 
+static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
+{
+	query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
+}
+
+static inline int ib_sa_query_cancelled(struct ib_sa_query *query)
+{
+	return (query->flags & IB_SA_CANCEL);
+}
+
+static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
+				     struct ib_sa_query *query)
+{
+	struct ib_sa_path_rec *sa_rec = query->mad_buf->context[1];
+	struct ib_sa_mad *mad = query->mad_buf->mad;
+	ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask;
+	u16 val16;
+	u64 val64;
+	struct rdma_ls_resolve_header *header;
+
+	query->mad_buf->context[1] = NULL;
+
+	/* Construct the family header first */
+	header = (struct rdma_ls_resolve_header *)
+		skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+	memcpy(header->device_name, query->port->agent->device->name,
+	       LS_DEVICE_NAME_MAX);
+	header->port_num = query->port->port_num;
+
+	if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
+	    sa_rec->reversible != 0)
+		query->path_use = LS_RESOLVE_PATH_USE_GMP;
+	else
+		query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
+	header->path_use = query->path_use;
+
+	/* Now build the attributes */
+	if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) {
+		val64 = be64_to_cpu(sa_rec->service_id);
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID,
+			sizeof(val64), &val64);
+	}
+	if (comp_mask & IB_SA_PATH_REC_DGID)
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID,
+			sizeof(sa_rec->dgid), &sa_rec->dgid);
+	if (comp_mask & IB_SA_PATH_REC_SGID)
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID,
+			sizeof(sa_rec->sgid), &sa_rec->sgid);
+	if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS,
+			sizeof(sa_rec->traffic_class), &sa_rec->traffic_class);
+
+	if (comp_mask & IB_SA_PATH_REC_PKEY) {
+		val16 = be16_to_cpu(sa_rec->pkey);
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY,
+			sizeof(val16), &val16);
+	}
+	if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) {
+		val16 = be16_to_cpu(sa_rec->qos_class);
+		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS,
+			sizeof(val16), &val16);
+	}
+}
+
+static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask)
+{
+	int len = 0;
+
+	if (comp_mask & IB_SA_PATH_REC_SERVICE_ID)
+		len += nla_total_size(sizeof(u64));
+	if (comp_mask & IB_SA_PATH_REC_DGID)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+	if (comp_mask & IB_SA_PATH_REC_SGID)
+		len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+	if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+		len += nla_total_size(sizeof(u8));
+	if (comp_mask & IB_SA_PATH_REC_PKEY)
+		len += nla_total_size(sizeof(u16));
+	if (comp_mask & IB_SA_PATH_REC_QOS_CLASS)
+		len += nla_total_size(sizeof(u16));
+
+	/*
+	 * Make sure that at least some of the required comp_mask bits are
+	 * set.
+	 */
+	if (WARN_ON(len == 0))
+		return len;
+
+	/* Add the family header */
+	len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header));
+
+	return len;
+}
+
+static int ib_nl_send_msg(struct ib_sa_query *query)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	void *data;
+	int ret = 0;
+	struct ib_sa_mad *mad;
+	int len;
+
+	mad = query->mad_buf->mad;
+	len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask);
+	if (len <= 0)
+		return -EMSGSIZE;
+
+	skb = nlmsg_new(len, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	/* Put nlmsg header only for now */
+	data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
+			    RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
+	if (!data) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	/* Add attributes */
+	ib_nl_set_path_rec_attrs(skb, query);
+
+	/* Repair the nlmsg header length */
+	nlmsg_end(skb, nlh);
+
+	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
+	if (!ret)
+		ret = len;
+	else
+		ret = 0;
+
+	return ret;
+}
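+
+/*
+ * Return convention of ib_nl_send_msg(), relied upon by the caller below:
+ * a positive value (the attribute length) means the request was multicast
+ * to the RDMA_NL_GROUP_LS listeners; 0 means ibnl_multicast() failed and
+ * the caller should fall back to the MAD path.
+ */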
+
+static int ib_nl_make_request(struct ib_sa_query *query)
+{
+	unsigned long flags;
+	unsigned long delay;
+	int ret;
+
+	INIT_LIST_HEAD(&query->list);
+	query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	ret = ib_nl_send_msg(query);
+	if (ret <= 0) {
+		ret = -EIO;
+		goto request_out;
+	} else {
+		ret = 0;
+	}
+
+	delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+	query->timeout = delay + jiffies;
+	list_add_tail(&query->list, &ib_nl_request_list);
+	/* Start the timeout if this is the only request */
+	if (ib_nl_request_list.next == &query->list)
+		queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+
+request_out:
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+	return ret;
+}
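+
+/*
+ * Lifecycle of a local service request as implemented above (sketch):
+ *
+ *	query->seq = atomic_inc_return(&ib_nl_sa_request_seq);
+ *	ib_nl_send_msg(query);		multicast to user space
+ *	list_add_tail(&query->list, &ib_nl_request_list);
+ *
+ * and then exactly one of:
+ *
+ *	ib_nl_handle_resolve_resp()	reply matched by nlmsg_seq, or
+ *	ib_nl_request_timeout()		expiry; falls back to
+ *					ib_post_send_mad()
+ */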
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+	unsigned long flags;
+	struct ib_sa_query *wait_query;
+	int found = 0;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	list_for_each_entry(wait_query, &ib_nl_request_list, list) {
+		/* Let the timeout routine take care of the callback */
+		if (query == wait_query) {
+			query->flags |= IB_SA_CANCEL;
+			query->timeout = jiffies;
+			list_move(&query->list, &ib_nl_request_list);
+			found = 1;
+			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+	return found;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
+					   const struct nlmsghdr *nlh)
+{
+	struct ib_mad_send_wc mad_send_wc;
+	struct ib_sa_mad *mad = NULL;
+	const struct nlattr *head, *curr;
+	struct ib_path_rec_data  *rec;
+	int len, rem;
+	u32 mask = 0;
+	int status = -EIO;
+
+	if (query->callback) {
+		head = (const struct nlattr *) nlmsg_data(nlh);
+		len = nlmsg_len(nlh);
+		switch (query->path_use) {
+		case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
+			mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
+			break;
+
+		case LS_RESOLVE_PATH_USE_ALL:
+		case LS_RESOLVE_PATH_USE_GMP:
+		default:
+			mask = IB_PATH_PRIMARY | IB_PATH_GMP |
+				IB_PATH_BIDIRECTIONAL;
+			break;
+		}
+		nla_for_each_attr(curr, head, len, rem) {
+			if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
+				rec = nla_data(curr);
+				/*
+				 * Get the first one. In the future, we may
+				 * need to get up to 6 pathrecords.
+				 */
+				if ((rec->flags & mask) == mask) {
+					mad = query->mad_buf->mad;
+					mad->mad_hdr.method |=
+						IB_MGMT_METHOD_RESP;
+					memcpy(mad->data, rec->path_rec,
+					       sizeof(rec->path_rec));
+					status = 0;
+					break;
+				}
+			}
+		}
+		query->callback(query, status, mad);
+	}
+
+	mad_send_wc.send_buf = query->mad_buf;
+	mad_send_wc.status = IB_WC_SUCCESS;
+	send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+	unsigned long flags;
+	struct ib_sa_query *query;
+	unsigned long delay;
+	struct ib_mad_send_wc mad_send_wc;
+	int ret;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	while (!list_empty(&ib_nl_request_list)) {
+		query = list_entry(ib_nl_request_list.next,
+				   struct ib_sa_query, list);
+
+		if (time_after(query->timeout, jiffies)) {
+			delay = query->timeout - jiffies;
+			if ((long)delay <= 0)
+				delay = 1;
+			queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+			break;
+		}
+
+		list_del(&query->list);
+		ib_sa_disable_local_svc(query);
+		/* Hold the lock to protect against query cancellation */
+		if (ib_sa_query_cancelled(query))
+			ret = -1;
+		else
+			ret = ib_post_send_mad(query->mad_buf, NULL);
+		if (ret) {
+			mad_send_wc.send_buf = query->mad_buf;
+			mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+			spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+			send_handler(query->port->agent, &mad_send_wc);
+			spin_lock_irqsave(&ib_nl_request_lock, flags);
+		}
+	}
+	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
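+
+/*
+ * The lock dance above is deliberate: ib_nl_request_lock is dropped around
+ * send_handler() because the completion handler may re-enter SA query code
+ * and take other locks; calling it under a spinlock risks deadlock. The
+ * entry has already been unlinked from the list, so re-taking the lock and
+ * continuing the scan is safe.
+ */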
+
+static int ib_nl_handle_set_timeout(struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+	int timeout, delta, abs_delta;
+	const struct nlattr *attr;
+	unsigned long flags;
+	struct ib_sa_query *query;
+	long delay = 0;
+	struct nlattr *tb[LS_NLA_TYPE_MAX];
+	int ret;
+
+	if (!netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+			nlmsg_len(nlh), ib_nl_policy);
+	attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];
+	if (ret || !attr)
+		goto settimeout_out;
+
+	timeout = *(int *) nla_data(attr);
+	if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
+		timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
+	if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
+		timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+
+	delta = timeout - sa_local_svc_timeout_ms;
+	if (delta < 0)
+		abs_delta = -delta;
+	else
+		abs_delta = delta;
+
+	if (delta != 0) {
+		spin_lock_irqsave(&ib_nl_request_lock, flags);
+		sa_local_svc_timeout_ms = timeout;
+		list_for_each_entry(query, &ib_nl_request_list, list) {
+			if (delta < 0 && abs_delta > query->timeout)
+				query->timeout = 0;
+			else
+				query->timeout += delta;
+
+			/* Get the new delay from the first entry */
+			if (!delay) {
+				delay = query->timeout - jiffies;
+				if (delay <= 0)
+					delay = 1;
+			}
+		}
+		if (delay)
+			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
+					 (unsigned long)delay);
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+	}
+
+settimeout_out:
+	return skb->len;
+}
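+
+/*
+ * A user space SA resolver would adjust the timeout with a message shaped
+ * roughly as follows (illustrative layout; names are from the uapi headers
+ * included at the top of this file):
+ *
+ *	nlmsghdr { .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_LS,
+ *						  RDMA_NL_LS_OP_SET_TIMEOUT) }
+ *	nlattr	 { .nla_type = LS_NLA_TYPE_TIMEOUT }	u32 value, in ms
+ *
+ * The value is clamped to [IB_SA_LOCAL_SVC_TIMEOUT_MIN,
+ * IB_SA_LOCAL_SVC_TIMEOUT_MAX], i.e. 100 ms to 200 s.
+ */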
+
+static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
+{
+	struct nlattr *tb[LS_NLA_TYPE_MAX];
+	int ret;
+
+	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+		return 0;
+
+	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+			nlmsg_len(nlh), ib_nl_policy);
+	if (ret)
+		return 0;
+
+	return 1;
+}
+
+static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+	unsigned long flags;
+	struct ib_sa_query *query;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_send_wc mad_send_wc;
+	int found = 0;
+	int ret;
+
+	if (!netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	spin_lock_irqsave(&ib_nl_request_lock, flags);
+	list_for_each_entry(query, &ib_nl_request_list, list) {
+		/*
+		 * If the query is cancelled, let the timeout routine
+		 * take care of it.
+		 */
+		if (nlh->nlmsg_seq == query->seq) {
+			found = !ib_sa_query_cancelled(query);
+			if (found)
+				list_del(&query->list);
+			break;
+		}
+	}
+
+	if (!found) {
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		goto resp_out;
+	}
+
+	send_buf = query->mad_buf;
+
+	if (!ib_nl_is_good_resolve_resp(nlh)) {
+		/* if the result is a failure, send out the packet via IB */
+		ib_sa_disable_local_svc(query);
+		ret = ib_post_send_mad(query->mad_buf, NULL);
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		if (ret) {
+			mad_send_wc.send_buf = send_buf;
+			mad_send_wc.status = IB_WC_GENERAL_ERR;
+			send_handler(query->port->agent, &mad_send_wc);
+		}
+	} else {
+		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+		ib_nl_process_good_resolve_rsp(query, nlh);
+	}
+
+resp_out:
+	return skb->len;
+}
+
+static struct ibnl_client_cbs ib_sa_cb_table[] = {
+	[RDMA_NL_LS_OP_RESOLVE] = {
+		.dump = ib_nl_handle_resolve_resp,
+		.module = THIS_MODULE },
+	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
+		.dump = ib_nl_handle_set_timeout,
+		.module = THIS_MODULE },
+};
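+
+/*
+ * This table is wired up in ib_sa_init() below via
+ * ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS, ib_sa_cb_table), after
+ * which the RDMA netlink core dispatches each incoming message to the
+ * .dump handler matching its client/op pair.
+ */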
+
 static void free_sm_ah(struct kref *kref)
 {
 	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -502,7 +960,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
 	mad_buf = query->mad_buf;
 	spin_unlock_irqrestore(&idr_lock, flags);
 
-	ib_cancel_mad(agent, mad_buf);
+	/*
+	 * If the query is still on the netlink request list, schedule
+	 * it to be cancelled by the timeout routine. Otherwise, it has been
+	 * sent to the MAD layer and has to be cancelled from there.
+	 */
+	if (!ib_nl_cancel_request(query))
+		ib_cancel_mad(agent, mad_buf);
 }
 EXPORT_SYMBOL(ib_sa_cancel_query);
 
@@ -639,6 +1103,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
 	query->mad_buf->context[0] = query;
 	query->id = id;
 
+	if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) {
+		if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
+			if (!ib_nl_make_request(query))
+				return id;
+		}
+		ib_sa_disable_local_svc(query);
+	}
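+	/* Note: ibnl_chk_listeners() returns 0 only when user space
+	 * listeners exist on RDMA_NL_GROUP_LS, so the netlink path is tried
+	 * first and any failure falls through to the plain MAD send below.
+	 */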
+
 	ret = ib_post_send_mad(query->mad_buf, NULL);
 	if (ret) {
 		spin_lock_irqsave(&idr_lock, flags);
@@ -740,7 +1212,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -767,6 +1239,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
 
 	*sa_query = &query->sa_query;
 
+	query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE;
+	query->sa_query.mad_buf->context[1] = rec;
+
 	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
 	if (ret < 0)
 		goto err2;
@@ -862,7 +1337,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
 	    method != IB_SA_METHOD_DELETE)
 		return -EINVAL;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -954,7 +1429,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -1051,7 +1526,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
 	port  = &sa_dev->port[port_num - sa_dev->start_port];
 	agent = port->agent;
 
-	query = kmalloc(sizeof *query, gfp_mask);
+	query = kzalloc(sizeof(*query), gfp_mask);
 	if (!query)
 		return -ENOMEM;
 
@@ -1221,9 +1696,9 @@ free:
 	return;
 }
 
-static void ib_sa_remove_one(struct ib_device *device)
+static void ib_sa_remove_one(struct ib_device *device, void *client_data)
 {
-	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_device *sa_dev = client_data;
 	int i;
 
 	if (!sa_dev)
@@ -1251,6 +1726,8 @@ static int __init ib_sa_init(void)
 
 	get_random_bytes(&tid, sizeof tid);
 
+	atomic_set(&ib_nl_sa_request_seq, 0);
+
 	ret = ib_register_client(&sa_client);
 	if (ret) {
 		printk(KERN_ERR "Couldn't register ib_sa client\n");
@@ -1263,7 +1740,25 @@ static int __init ib_sa_init(void)
 		goto err2;
 	}
 
+	ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
+	if (!ib_nl_wq) {
+		ret = -ENOMEM;
+		goto err3;
+	}
+
+	if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS,
+			    ib_sa_cb_table)) {
+		pr_err("Failed to add netlink callback\n");
+		ret = -EINVAL;
+		goto err4;
+	}
+	INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
 	return 0;
+err4:
+	destroy_workqueue(ib_nl_wq);
+err3:
+	mcast_cleanup();
 err2:
 	ib_unregister_client(&sa_client);
 err1:
@@ -1272,6 +1767,10 @@ err1:
 
 static void __exit ib_sa_cleanup(void)
 {
+	ibnl_remove_client(RDMA_NL_LS);
+	cancel_delayed_work(&ib_nl_timed_work);
+	flush_workqueue(ib_nl_wq);
+	destroy_workqueue(ib_nl_wq);
 	mcast_cleanup();
 	ib_unregister_client(&sa_client);
 	idr_destroy(&query_idr);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 0b84a9cdf..34cdd74b0 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -457,29 +457,6 @@ static struct kobj_type port_type = {
 	.default_attrs = port_default_attrs
 };
 
-static void ib_device_release(struct device *device)
-{
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
-
-	kfree(dev->port_immutable);
-	kfree(dev);
-}
-
-static int ib_device_uevent(struct device *device,
-			    struct kobj_uevent_env *env)
-{
-	struct ib_device *dev = container_of(device, struct ib_device, dev);
-
-	if (add_uevent_var(env, "NAME=%s", dev->name))
-		return -ENOMEM;
-
-	/*
-	 * It would be nice to pass the node GUID with the event...
-	 */
-
-	return 0;
-}
-
 static struct attribute **
 alloc_group_attrs(ssize_t (*show)(struct ib_port *,
 				  struct port_attribute *, char *buf),
@@ -702,12 +679,6 @@ static struct device_attribute *ib_class_attributes[] = {
 	&dev_attr_node_desc
 };
 
-static struct class ib_class = {
-	.name    = "infiniband",
-	.dev_release = ib_device_release,
-	.dev_uevent = ib_device_uevent,
-};
-
 /* Show a given an attribute in the statistics group */
 static ssize_t show_protocol_stat(const struct device *device,
 			    struct device_attribute *attr, char *buf,
@@ -846,14 +817,12 @@ int ib_device_register_sysfs(struct ib_device *device,
 	int ret;
 	int i;
 
-	class_dev->class      = &ib_class;
-	class_dev->parent     = device->dma_device;
-	dev_set_name(class_dev, "%s", device->name);
-	dev_set_drvdata(class_dev, device);
-
-	INIT_LIST_HEAD(&device->port_list);
+	device->dev.parent = device->dma_device;
+	ret = dev_set_name(class_dev, "%s", device->name);
+	if (ret)
+		return ret;
 
-	ret = device_register(class_dev);
+	ret = device_add(class_dev);
 	if (ret)
 		goto err;
 
@@ -916,13 +885,3 @@ void ib_device_unregister_sysfs(struct ib_device *device)
 
 	device_unregister(&device->dev);
 }
-
-int ib_sysfs_setup(void)
-{
-	return class_register(&ib_class);
-}
-
-void ib_sysfs_cleanup(void)
-{
-	class_unregister(&ib_class);
-}
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 009481073..6b4e8a008 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -109,7 +109,7 @@ enum {
 #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
 
 static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client ucm_client = {
 	.name   = "ucm",
@@ -658,8 +658,7 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
 	if (result)
 		goto out;
 
-	result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
-			      NULL);
+	result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
 out:
 	ib_ucm_ctx_put(ctx);
 	return result;
@@ -1310,9 +1309,9 @@ err:
 	return;
 }
 
-static void ib_ucm_remove_one(struct ib_device *device)
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
 {
-	struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+	struct ib_ucm_device *ucm_dev = client_data;
 
 	if (!ucm_dev)
 		return;
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 29b21213e..30467d10d 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -74,6 +74,7 @@ struct ucma_file {
 	struct list_head	ctx_list;
 	struct list_head	event_list;
 	wait_queue_head_t	poll_wait;
+	struct workqueue_struct	*close_wq;
 };
 
 struct ucma_context {
@@ -89,6 +90,13 @@ struct ucma_context {
 
 	struct list_head	list;
 	struct list_head	mc_list;
+	/* mark that the device is in the process of destroying its
+	 * internal HW resources; protected by the global mut
+	 */
+	int			closing;
+	/* sync between removal event and id destroy, protected by file mut */
+	int			destroying;
+	struct work_struct	close_work;
 };
 
 struct ucma_multicast {
@@ -107,6 +115,7 @@ struct ucma_event {
 	struct list_head	list;
 	struct rdma_cm_id	*cm_id;
 	struct rdma_ucm_event_resp resp;
+	struct work_struct	close_work;
 };
 
 static DEFINE_MUTEX(mut);
@@ -132,8 +141,12 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
 
 	mutex_lock(&mut);
 	ctx = _ucma_find_context(id, file);
-	if (!IS_ERR(ctx))
-		atomic_inc(&ctx->ref);
+	if (!IS_ERR(ctx)) {
+		if (ctx->closing)
+			ctx = ERR_PTR(-EIO);
+		else
+			atomic_inc(&ctx->ref);
+	}
 	mutex_unlock(&mut);
 	return ctx;
 }
@@ -144,6 +157,28 @@ static void ucma_put_ctx(struct ucma_context *ctx)
 		complete(&ctx->comp);
 }
 
+static void ucma_close_event_id(struct work_struct *work)
+{
+	struct ucma_event *uevent_close =
+		container_of(work, struct ucma_event, close_work);
+
+	rdma_destroy_id(uevent_close->cm_id);
+	kfree(uevent_close);
+}
+
+static void ucma_close_id(struct work_struct *work)
+{
+	struct ucma_context *ctx =
+		container_of(work, struct ucma_context, close_work);
+
+	/* Once all inflight tasks are finished, we close all underlying
+	 * resources. The context is still alive until it is explicitly
+	 * destroyed by its creator.
+	 */
+	ucma_put_ctx(ctx);
+	wait_for_completion(&ctx->comp);
+	/* No new events will be generated after destroying the id. */
+	rdma_destroy_id(ctx->cm_id);
+}
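+
+/* The put/wait pair above is the context teardown handshake: ucma_put_ctx()
+ * drops the creator's reference, whichever ucma_put_ctx() drops the final
+ * reference fires ctx->comp, and wait_for_completion() therefore returns
+ * only once no ucma_get_ctx() holder remains.
+ */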
+
 static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
 {
 	struct ucma_context *ctx;
@@ -152,6 +187,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
 	if (!ctx)
 		return NULL;
 
+	INIT_WORK(&ctx->close_work, ucma_close_id);
 	atomic_set(&ctx->ref, 1);
 	init_completion(&ctx->comp);
 	INIT_LIST_HEAD(&ctx->mc_list);
@@ -242,6 +278,44 @@ static void ucma_set_event_context(struct ucma_context *ctx,
 	}
 }
 
+/* Called with file->mut locked for the relevant context. */
+static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
+{
+	struct ucma_context *ctx = cm_id->context;
+	struct ucma_event *con_req_eve;
+	int event_found = 0;
+
+	if (ctx->destroying)
+		return;
+
+	/* Only if the context points to this cm_id does it own it, and only
+	 * then can the context be queued to be closed. Otherwise the cm_id
+	 * is an inflight one, sitting on this context's event list and
+	 * pending to be detached and reattached to its new context as part
+	 * of ucma_get_event; that case is handled separately below.
+	 */
+	if (ctx->cm_id == cm_id) {
+		mutex_lock(&mut);
+		ctx->closing = 1;
+		mutex_unlock(&mut);
+		queue_work(ctx->file->close_wq, &ctx->close_work);
+		return;
+	}
+
+	list_for_each_entry(con_req_eve, &ctx->file->event_list, list) {
+		if (con_req_eve->cm_id == cm_id &&
+		    con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+			list_del(&con_req_eve->list);
+			INIT_WORK(&con_req_eve->close_work, ucma_close_event_id);
+			queue_work(ctx->file->close_wq, &con_req_eve->close_work);
+			event_found = 1;
+			break;
+		}
+	}
+	if (!event_found)
+		printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n");
+}
+
 static int ucma_event_handler(struct rdma_cm_id *cm_id,
 			      struct rdma_cm_event *event)
 {
@@ -276,14 +350,21 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
 		 * We ignore events for new connections until userspace has set
 		 * their context.  This can only happen if an error occurs on a
 		 * new connection before the user accepts it.  This is okay,
-		 * since the accept will just fail later.
+		 * since the accept will just fail later. However, we do need
+		 * to release the underlying HW resources in case of a device
+		 * removal event.
 		 */
+		if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+			ucma_removal_event_handler(cm_id);
+
 		kfree(uevent);
 		goto out;
 	}
 
 	list_add_tail(&uevent->list, &ctx->file->event_list);
 	wake_up_interruptible(&ctx->file->poll_wait);
+	if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+		ucma_removal_event_handler(cm_id);
 out:
 	mutex_unlock(&ctx->file->mut);
 	return ret;
@@ -442,9 +523,15 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
 }
 
 /*
- * We cannot hold file->mut when calling rdma_destroy_id() or we can
- * deadlock.  We also acquire file->mut in ucma_event_handler(), and
- * rdma_destroy_id() will wait until all callbacks have completed.
+ * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At
+ * this point, no new events will be reported from the hardware. However, we
+ * still need to clean up the UCMA context for this ID. Specifically, there
+ * might be events that have not yet been consumed by the user space software.
+ * These might include pending connect requests which we have not completed
+ * processing.  We cannot call rdma_destroy_id while holding the lock of the
+ * context (file->mut), as it might cause a deadlock. We therefore extract all
+ * relevant events from the context pending events list while holding the
+ * mutex. After that we release them as needed.
  */
 static int ucma_free_ctx(struct ucma_context *ctx)
 {
@@ -452,8 +539,6 @@ static int ucma_free_ctx(struct ucma_context *ctx)
 	struct ucma_event *uevent, *tmp;
 	LIST_HEAD(list);
 
-	/* No new events will be generated after destroying the id. */
-	rdma_destroy_id(ctx->cm_id);
 
 	ucma_cleanup_multicast(ctx);
 
@@ -501,10 +586,24 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
-	ucma_put_ctx(ctx);
-	wait_for_completion(&ctx->comp);
-	resp.events_reported = ucma_free_ctx(ctx);
+	mutex_lock(&ctx->file->mut);
+	ctx->destroying = 1;
+	mutex_unlock(&ctx->file->mut);
+
+	flush_workqueue(ctx->file->close_wq);
+	/* At this point it's guaranteed that there is no inflight
+	 * closing task.
+	 */
+	mutex_lock(&mut);
+	if (!ctx->closing) {
+		mutex_unlock(&mut);
+		ucma_put_ctx(ctx);
+		wait_for_completion(&ctx->comp);
+		rdma_destroy_id(ctx->cm_id);
+	} else {
+		mutex_unlock(&mut);
+	}
 
+	resp.events_reported = ucma_free_ctx(ctx);
 	if (copy_to_user((void __user *)(unsigned long)cmd.response,
 			 &resp, sizeof(resp)))
 		ret = -EFAULT;
@@ -1321,10 +1420,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
 		mc = ERR_PTR(-ENOENT);
 	else if (mc->ctx->file != file)
 		mc = ERR_PTR(-EINVAL);
-	else {
+	else if (!atomic_inc_not_zero(&mc->ctx->ref))
+		mc = ERR_PTR(-ENXIO);
+	else
 		idr_remove(&multicast_idr, mc->id);
-		atomic_inc(&mc->ctx->ref);
-	}
 	mutex_unlock(&mut);
 
 	if (IS_ERR(mc)) {
@@ -1525,6 +1624,12 @@ static int ucma_open(struct inode *inode, struct file *filp)
 	if (!file)
 		return -ENOMEM;
 
+	file->close_wq = create_singlethread_workqueue("ucma_close_id");
+	if (!file->close_wq) {
+		kfree(file);
+		return -ENOMEM;
+	}
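+	/* A single threaded workqueue serializes the close_work items of
+	 * this file, so teardown of different contexts cannot race with one
+	 * another.
+	 */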
+
 	INIT_LIST_HEAD(&file->event_list);
 	INIT_LIST_HEAD(&file->ctx_list);
 	init_waitqueue_head(&file->poll_wait);
@@ -1543,16 +1648,34 @@ static int ucma_close(struct inode *inode, struct file *filp)
 
 	mutex_lock(&file->mut);
 	list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+		ctx->destroying = 1;
 		mutex_unlock(&file->mut);
 
 		mutex_lock(&mut);
 		idr_remove(&ctx_idr, ctx->id);
 		mutex_unlock(&mut);
 
+		flush_workqueue(file->close_wq);
+		/* At this step, once the ctx has been marked as destroying
+		 * and the workqueue has been flushed, we are safe from any
+		 * inflight handlers that might queue another closing task.
+		 */
+		mutex_lock(&mut);
+		if (!ctx->closing) {
+			mutex_unlock(&mut);
+			/* rdma_destroy_id ensures that no event handlers are
+			 * inflight for that id before releasing it.
+			 */
+			rdma_destroy_id(ctx->cm_id);
+		} else {
+			mutex_unlock(&mut);
+		}
+
 		ucma_free_ctx(ctx);
 		mutex_lock(&file->mut);
 	}
 	mutex_unlock(&file->mut);
+	destroy_workqueue(file->close_wq);
 	kfree(file);
 	return 0;
 }
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 35567fffa..57f281f8d 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -133,7 +133,7 @@ static DEFINE_SPINLOCK(port_lock);
 static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
 
 static void ib_umad_add_one(struct ib_device *device);
-static void ib_umad_remove_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device, void *client_data);
 
 static void ib_umad_release_dev(struct kobject *kobj)
 {
@@ -1322,9 +1322,9 @@ free:
 	kobject_put(&umad_dev->kobj);
 }
 
-static void ib_umad_remove_one(struct ib_device *device)
+static void ib_umad_remove_one(struct ib_device *device, void *client_data)
 {
-	struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+	struct ib_umad_device *umad_dev = client_data;
 	int i;
 
 	if (!umad_dev)
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 65cbfcc92..3863d33c2 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -89,12 +89,16 @@ struct ib_uverbs_device {
 	int					num_comp_vectors;
 	struct completion			comp;
 	struct device			       *dev;
-	struct ib_device		       *ib_dev;
+	struct ib_device	__rcu	       *ib_dev;
 	int					devnum;
 	struct cdev			        cdev;
 	struct rb_root				xrcd_tree;
 	struct mutex				xrcd_tree_mutex;
 	struct kobject				kobj;
+	struct srcu_struct			disassociate_srcu;
+	struct mutex				lists_mutex; /* protect lists */
+	struct list_head			uverbs_file_list;
+	struct list_head			uverbs_events_file_list;
 };
 
 struct ib_uverbs_event_file {
@@ -106,6 +110,7 @@ struct ib_uverbs_event_file {
 	wait_queue_head_t			poll_wait;
 	struct fasync_struct		       *async_queue;
 	struct list_head			event_list;
+	struct list_head			list;
 };
 
 struct ib_uverbs_file {
@@ -115,6 +120,8 @@ struct ib_uverbs_file {
 	struct ib_ucontext		       *ucontext;
 	struct ib_event_handler			event_handler;
 	struct ib_uverbs_event_file	       *async_file;
+	struct list_head			list;
+	int					is_closed;
 };
 
 struct ib_uverbs_event {
@@ -178,7 +185,9 @@ extern struct idr ib_uverbs_rule_idr;
 void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
 
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					struct ib_device *ib_dev,
 					int is_async);
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
 struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
 
 void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
@@ -213,6 +222,7 @@ struct ib_uverbs_flow_spec {
 
 #define IB_UVERBS_DECLARE_CMD(name)					\
 	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\
+				 struct ib_device *ib_dev,              \
 				 const char __user *buf, int in_len,	\
 				 int out_len)
 
@@ -254,6 +264,7 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
 
 #define IB_UVERBS_DECLARE_EX_CMD(name)				\
 	int ib_uverbs_ex_##name(struct ib_uverbs_file *file,	\
+				struct ib_device *ib_dev,		\
 				struct ib_udata *ucore,		\
 				struct ib_udata *uhw)
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index a6ca83b31..be4cb9f04 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -282,13 +282,13 @@ static void put_xrcd_read(struct ib_uobject *uobj)
 }
 
 ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+			      struct ib_device *ib_dev,
 			      const char __user *buf,
 			      int in_len, int out_len)
 {
 	struct ib_uverbs_get_context      cmd;
 	struct ib_uverbs_get_context_resp resp;
 	struct ib_udata                   udata;
-	struct ib_device                 *ibdev = file->device->ib_dev;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	struct ib_device_attr		  dev_attr;
 #endif
@@ -313,13 +313,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
-	ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+	ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
 	if (IS_ERR(ucontext)) {
 		ret = PTR_ERR(ucontext);
 		goto err;
 	}
 
-	ucontext->device = ibdev;
+	ucontext->device = ib_dev;
 	INIT_LIST_HEAD(&ucontext->pd_list);
 	INIT_LIST_HEAD(&ucontext->mr_list);
 	INIT_LIST_HEAD(&ucontext->mw_list);
@@ -340,7 +340,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	ucontext->odp_mrs_count = 0;
 	INIT_LIST_HEAD(&ucontext->no_private_counters);
 
-	ret = ib_query_device(ibdev, &dev_attr);
+	ret = ib_query_device(ib_dev, &dev_attr);
 	if (ret)
 		goto err_free;
 	if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
@@ -355,7 +355,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 		goto err_free;
 	resp.async_fd = ret;
 
-	filp = ib_uverbs_alloc_event_file(file, 1);
+	filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);
 	if (IS_ERR(filp)) {
 		ret = PTR_ERR(filp);
 		goto err_fd;
@@ -367,16 +367,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 		goto err_file;
 	}
 
-	file->async_file = filp->private_data;
-
-	INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
-			      ib_uverbs_event_handler);
-	ret = ib_register_event_handler(&file->event_handler);
-	if (ret)
-		goto err_file;
-
-	kref_get(&file->async_file->ref);
-	kref_get(&file->ref);
 	file->ucontext = ucontext;
 
 	fd_install(resp.async_fd, filp);
@@ -386,6 +376,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	return in_len;
 
 err_file:
+	ib_uverbs_free_async_event_file(file);
 	fput(filp);
 
 err_fd:
@@ -393,7 +384,7 @@ err_fd:
 
 err_free:
 	put_pid(ucontext->tgid);
-	ibdev->dealloc_ucontext(ucontext);
+	ib_dev->dealloc_ucontext(ucontext);
 
 err:
 	mutex_unlock(&file->mutex);
@@ -401,11 +392,12 @@ err:
 }
 
 static void copy_query_dev_fields(struct ib_uverbs_file *file,
+				  struct ib_device *ib_dev,
 				  struct ib_uverbs_query_device_resp *resp,
 				  struct ib_device_attr *attr)
 {
 	resp->fw_ver		= attr->fw_ver;
-	resp->node_guid		= file->device->ib_dev->node_guid;
+	resp->node_guid		= ib_dev->node_guid;
 	resp->sys_image_guid	= attr->sys_image_guid;
 	resp->max_mr_size	= attr->max_mr_size;
 	resp->page_size_cap	= attr->page_size_cap;
@@ -443,10 +435,11 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file,
 	resp->max_srq_sge		= attr->max_srq_sge;
 	resp->max_pkeys			= attr->max_pkeys;
 	resp->local_ca_ack_delay	= attr->local_ca_ack_delay;
-	resp->phys_port_cnt		= file->device->ib_dev->phys_port_cnt;
+	resp->phys_port_cnt		= ib_dev->phys_port_cnt;
 }
 
 ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+			       struct ib_device *ib_dev,
 			       const char __user *buf,
 			       int in_len, int out_len)
 {
@@ -461,12 +454,12 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
-	ret = ib_query_device(file->device->ib_dev, &attr);
+	ret = ib_query_device(ib_dev, &attr);
 	if (ret)
 		return ret;
 
 	memset(&resp, 0, sizeof resp);
-	copy_query_dev_fields(file, &resp, &attr);
+	copy_query_dev_fields(file, ib_dev, &resp, &attr);
 
 	if (copy_to_user((void __user *) (unsigned long) cmd.response,
 			 &resp, sizeof resp))
@@ -476,6 +469,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf,
 			     int in_len, int out_len)
 {
@@ -490,7 +484,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
-	ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+	ret = ib_query_port(ib_dev, cmd.port_num, &attr);
 	if (ret)
 		return ret;
 
@@ -515,7 +509,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
 	resp.active_width    = attr.active_width;
 	resp.active_speed    = attr.active_speed;
 	resp.phys_state      = attr.phys_state;
-	resp.link_layer      = rdma_port_get_link_layer(file->device->ib_dev,
+	resp.link_layer      = rdma_port_get_link_layer(ib_dev,
 							cmd.port_num);
 
 	if (copy_to_user((void __user *) (unsigned long) cmd.response,
@@ -526,6 +520,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+			   struct ib_device *ib_dev,
 			   const char __user *buf,
 			   int in_len, int out_len)
 {
@@ -553,15 +548,15 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
 	init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
 	down_write(&uobj->mutex);
 
-	pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
-					    file->ucontext, &udata);
+	pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
 	if (IS_ERR(pd)) {
 		ret = PTR_ERR(pd);
 		goto err;
 	}
 
-	pd->device  = file->device->ib_dev;
+	pd->device  = ib_dev;
 	pd->uobject = uobj;
+	pd->local_mr = NULL;
 	atomic_set(&pd->usecnt, 0);
 
 	uobj->object = pd;
@@ -600,11 +595,13 @@ err:
 }
 
 ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf,
 			     int in_len, int out_len)
 {
 	struct ib_uverbs_dealloc_pd cmd;
 	struct ib_uobject          *uobj;
+	struct ib_pd		   *pd;
 	int                         ret;
 
 	if (copy_from_user(&cmd, buf, sizeof cmd))
@@ -613,15 +610,20 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
 	uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
 	if (!uobj)
 		return -EINVAL;
+	pd = uobj->object;
 
-	ret = ib_dealloc_pd(uobj->object);
-	if (!ret)
-		uobj->live = 0;
-
-	put_uobj_write(uobj);
+	if (atomic_read(&pd->usecnt)) {
+		ret = -EBUSY;
+		goto err_put;
+	}
 
+	ret = pd->device->dealloc_pd(uobj->object);
+	WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
 	if (ret)
-		return ret;
+		goto err_put;
+
+	uobj->live = 0;
+	put_uobj_write(uobj);
 
 	idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
 
@@ -632,6 +634,10 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
 	put_uobj(uobj);
 
 	return in_len;
+
+err_put:
+	put_uobj_write(uobj);
+	return ret;
 }
 
 struct xrcd_table_entry {
@@ -720,6 +726,7 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev,
 }
 
 ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -778,15 +785,14 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
 	down_write(&obj->uobject.mutex);
 
 	if (!xrcd) {
-		xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
-							file->ucontext, &udata);
+		xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata);
 		if (IS_ERR(xrcd)) {
 			ret = PTR_ERR(xrcd);
 			goto err;
 		}
 
 		xrcd->inode   = inode;
-		xrcd->device  = file->device->ib_dev;
+		xrcd->device  = ib_dev;
 		atomic_set(&xrcd->usecnt, 0);
 		mutex_init(&xrcd->tgt_qp_mutex);
 		INIT_LIST_HEAD(&xrcd->tgt_qp_list);
@@ -857,6 +863,7 @@ err_tree_mutex_unlock:
 }
 
 ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
@@ -934,6 +941,7 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
 }
 
 ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+			 struct ib_device *ib_dev,
 			 const char __user *buf, int in_len,
 			 int out_len)
 {
@@ -1043,6 +1051,7 @@ err_free:
 }
 
 ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+			   struct ib_device *ib_dev,
 			   const char __user *buf, int in_len,
 			   int out_len)
 {
@@ -1136,6 +1145,7 @@ put_uobjs:
 }
 
 ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+			   struct ib_device *ib_dev,
 			   const char __user *buf, int in_len,
 			   int out_len)
 {
@@ -1174,8 +1184,9 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
-			 const char __user *buf, int in_len,
-			 int out_len)
+			   struct ib_device *ib_dev,
+			   const char __user *buf, int in_len,
+			   int out_len)
 {
 	struct ib_uverbs_alloc_mw      cmd;
 	struct ib_uverbs_alloc_mw_resp resp;
@@ -1256,8 +1267,9 @@ err_free:
 }
 
 ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
-			   const char __user *buf, int in_len,
-			   int out_len)
+			     struct ib_device *ib_dev,
+			     const char __user *buf, int in_len,
+			     int out_len)
 {
 	struct ib_uverbs_dealloc_mw cmd;
 	struct ib_mw               *mw;
@@ -1294,6 +1306,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+				      struct ib_device *ib_dev,
 				      const char __user *buf, int in_len,
 				      int out_len)
 {
@@ -1313,7 +1326,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
 		return ret;
 	resp.fd = ret;
 
-	filp = ib_uverbs_alloc_event_file(file, 0);
+	filp = ib_uverbs_alloc_event_file(file, ib_dev, 0);
 	if (IS_ERR(filp)) {
 		put_unused_fd(resp.fd);
 		return PTR_ERR(filp);
@@ -1331,6 +1344,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
 }
 
 static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+					struct ib_device *ib_dev,
 				       struct ib_udata *ucore,
 				       struct ib_udata *uhw,
 				       struct ib_uverbs_ex_create_cq *cmd,
@@ -1379,14 +1393,14 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
 	if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
 		attr.flags = cmd->flags;
 
-	cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
+	cq = ib_dev->create_cq(ib_dev, &attr,
 					     file->ucontext, uhw);
 	if (IS_ERR(cq)) {
 		ret = PTR_ERR(cq);
 		goto err_file;
 	}
 
-	cq->device        = file->device->ib_dev;
+	cq->device        = ib_dev;
 	cq->uobject       = &obj->uobject;
 	cq->comp_handler  = ib_uverbs_comp_handler;
 	cq->event_handler = ib_uverbs_cq_event_handler;
@@ -1447,6 +1461,7 @@ static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -1475,7 +1490,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
 	cmd_ex.comp_vector = cmd.comp_vector;
 	cmd_ex.comp_channel = cmd.comp_channel;
 
-	obj = create_cq(file, &ucore, &uhw, &cmd_ex,
+	obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex,
 			offsetof(typeof(cmd_ex), comp_channel) +
 			sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
 			NULL);
@@ -1498,6 +1513,7 @@ static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
 }
 
 int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
+			   struct ib_device *ib_dev,
 			   struct ib_udata *ucore,
 			   struct ib_udata *uhw)
 {
@@ -1523,7 +1539,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
 			     sizeof(resp.response_length)))
 		return -ENOSPC;
 
-	obj = create_cq(file, ucore, uhw, &cmd,
+	obj = create_cq(file, ib_dev, ucore, uhw, &cmd,
 			min(ucore->inlen, sizeof(cmd)),
 			ib_uverbs_ex_create_cq_cb, NULL);
 
@@ -1534,6 +1550,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -1597,6 +1614,7 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
 }
 
 ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+			  struct ib_device *ib_dev,
 			  const char __user *buf, int in_len,
 			  int out_len)
 {
@@ -1648,6 +1666,7 @@ out_put:
 }
 
 ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+				struct ib_device *ib_dev,
 				const char __user *buf, int in_len,
 				int out_len)
 {
@@ -1670,6 +1689,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
@@ -1722,6 +1742,7 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -1917,6 +1938,7 @@ err_put:
 }
 
 ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+			  struct ib_device *ib_dev,
 			  const char __user *buf, int in_len, int out_len)
 {
 	struct ib_uverbs_open_qp        cmd;
@@ -2011,6 +2033,7 @@ err_put:
 }
 
 ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+			   struct ib_device *ib_dev,
 			   const char __user *buf, int in_len,
 			   int out_len)
 {
@@ -2125,6 +2148,7 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
 }
 
 ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -2221,6 +2245,7 @@ out:
 }
 
 ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
@@ -2279,6 +2304,7 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -2531,6 +2557,7 @@ err:
 }
 
 ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -2580,6 +2607,7 @@ out:
 }
 
 ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+				struct ib_device *ib_dev,
 				const char __user *buf, int in_len,
 				int out_len)
 {
@@ -2629,6 +2657,7 @@ out:
 }
 
 ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
@@ -2721,6 +2750,7 @@ err:
 }
 
 ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len, int out_len)
 {
 	struct ib_uverbs_destroy_ah cmd;
@@ -2757,6 +2787,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+			       struct ib_device *ib_dev,
 			       const char __user *buf, int in_len,
 			       int out_len)
 {
@@ -2804,6 +2835,7 @@ out_put:
 }
 
 ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+			       struct ib_device *ib_dev,
 			       const char __user *buf, int in_len,
 			       int out_len)
 {
@@ -2884,6 +2916,7 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
 }
 
 int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     struct ib_udata *ucore,
 			     struct ib_udata *uhw)
 {
@@ -3044,6 +3077,7 @@ err_free_attr:
 }
 
 int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+			      struct ib_device *ib_dev,
 			      struct ib_udata *ucore,
 			      struct ib_udata *uhw)
 {
@@ -3086,6 +3120,7 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
 }
 
 static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+				struct ib_device *ib_dev,
 				struct ib_uverbs_create_xsrq *cmd,
 				struct ib_udata *udata)
 {
@@ -3219,6 +3254,7 @@ err:
 }
 
 ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
@@ -3246,7 +3282,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
-	ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+	ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata);
 	if (ret)
 		return ret;
 
@@ -3254,6 +3290,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+			      struct ib_device *ib_dev,
 			      const char __user *buf, int in_len, int out_len)
 {
 	struct ib_uverbs_create_xsrq     cmd;
@@ -3271,7 +3308,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
-	ret = __uverbs_create_xsrq(file, &cmd, &udata);
+	ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata);
 	if (ret)
 		return ret;
 
@@ -3279,6 +3316,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+			     struct ib_device *ib_dev,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
@@ -3309,6 +3347,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+			    struct ib_device *ib_dev,
 			    const char __user *buf,
 			    int in_len, int out_len)
 {
@@ -3349,6 +3388,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+			      struct ib_device *ib_dev,
 			      const char __user *buf, int in_len,
 			      int out_len)
 {
@@ -3406,16 +3446,15 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 }
 
 int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+			      struct ib_device *ib_dev,
 			      struct ib_udata *ucore,
 			      struct ib_udata *uhw)
 {
 	struct ib_uverbs_ex_query_device_resp resp;
 	struct ib_uverbs_ex_query_device  cmd;
 	struct ib_device_attr attr;
-	struct ib_device *device;
 	int err;
 
-	device = file->device->ib_dev;
 	if (ucore->inlen < sizeof(cmd))
 		return -EINVAL;
 
@@ -3436,11 +3475,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
 
 	memset(&attr, 0, sizeof(attr));
 
-	err = device->query_device(device, &attr, uhw);
+	err = ib_dev->query_device(ib_dev, &attr, uhw);
 	if (err)
 		return err;
 
-	copy_query_dev_fields(file, &resp.base, &attr);
+	copy_query_dev_fields(file, ib_dev, &resp.base, &attr);
 	resp.comp_mask = 0;
 
 	if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 15f4126a5..c29a660c7 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -79,6 +79,7 @@ static DEFINE_SPINLOCK(map_lock);
 static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
 
 static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+				     struct ib_device *ib_dev,
 				     const char __user *buf, int in_len,
 				     int out_len) = {
 	[IB_USER_VERBS_CMD_GET_CONTEXT]		= ib_uverbs_get_context,
@@ -119,6 +120,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 };
 
 static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+				    struct ib_device *ib_dev,
 				    struct ib_udata *ucore,
 				    struct ib_udata *uhw) = {
 	[IB_USER_VERBS_EX_CMD_CREATE_FLOW]	= ib_uverbs_ex_create_flow,
@@ -128,13 +130,14 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
 };
 
 static void ib_uverbs_add_one(struct ib_device *device);
-static void ib_uverbs_remove_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
 
 static void ib_uverbs_release_dev(struct kobject *kobj)
 {
 	struct ib_uverbs_device *dev =
 		container_of(kobj, struct ib_uverbs_device, kobj);
 
+	cleanup_srcu_struct(&dev->disassociate_srcu);
 	kfree(dev);
 }
 
@@ -205,9 +208,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 {
 	struct ib_uobject *uobj, *tmp;
 
-	if (!context)
-		return 0;
-
 	context->closing = 1;
 
 	list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
@@ -316,8 +316,16 @@ static void ib_uverbs_release_file(struct kref *ref)
 {
 	struct ib_uverbs_file *file =
 		container_of(ref, struct ib_uverbs_file, ref);
+	struct ib_device *ib_dev;
+	int srcu_key;
+
+	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+	ib_dev = srcu_dereference(file->device->ib_dev,
+				  &file->device->disassociate_srcu);
+	if (ib_dev && !ib_dev->disassociate_ucontext)
+		module_put(ib_dev->owner);
+	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
 
-	module_put(file->device->ib_dev->owner);
 	if (atomic_dec_and_test(&file->device->refcount))
 		ib_uverbs_comp_dev(file->device);
 
@@ -341,9 +349,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
 			return -EAGAIN;
 
 		if (wait_event_interruptible(file->poll_wait,
-					     !list_empty(&file->event_list)))
+					     (!list_empty(&file->event_list) ||
+			/* The barriers built into wait_event_interruptible()
+			 * and wake_up() guarantee this will observe the NULL
+			 * assignment to ib_dev without using RCU.
+			 */
+					     !file->uverbs_file->device->ib_dev)))
 			return -ERESTARTSYS;
 
+		/* If the device was disassociated and no event exists, return an error */
+		if (list_empty(&file->event_list) &&
+		    !file->uverbs_file->device->ib_dev)
+			return -EIO;
+
 		spin_lock_irq(&file->lock);
 	}
 
@@ -406,8 +424,11 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
 {
 	struct ib_uverbs_event_file *file = filp->private_data;
 	struct ib_uverbs_event *entry, *tmp;
+	int closed_already = 0;
 
+	mutex_lock(&file->uverbs_file->device->lists_mutex);
 	spin_lock_irq(&file->lock);
+	closed_already = file->is_closed;
 	file->is_closed = 1;
 	list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
 		if (entry->counter)
@@ -415,11 +436,15 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
 		kfree(entry);
 	}
 	spin_unlock_irq(&file->lock);
-
-	if (file->is_async) {
-		ib_unregister_event_handler(&file->uverbs_file->event_handler);
-		kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+	if (!closed_already) {
+		list_del(&file->list);
+		if (file->is_async)
+			ib_unregister_event_handler(&file->uverbs_file->
+				event_handler);
 	}
+	mutex_unlock(&file->uverbs_file->device->lists_mutex);
+
+	kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
 	kref_put(&file->ref, ib_uverbs_release_event_file);
 
 	return 0;
@@ -551,13 +576,21 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
 				NULL, NULL);
 }
 
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
+{
+	kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+	file->async_file = NULL;
+}
+
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					struct ib_device	*ib_dev,
 					int is_async)
 {
 	struct ib_uverbs_event_file *ev_file;
 	struct file *filp;
+	int ret;
 
-	ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+	ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
 	if (!ev_file)
 		return ERR_PTR(-ENOMEM);
 
@@ -566,16 +599,47 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
 	INIT_LIST_HEAD(&ev_file->event_list);
 	init_waitqueue_head(&ev_file->poll_wait);
 	ev_file->uverbs_file = uverbs_file;
+	kref_get(&ev_file->uverbs_file->ref);
 	ev_file->async_queue = NULL;
-	ev_file->is_async    = is_async;
 	ev_file->is_closed   = 0;
 
 	filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
 				  ev_file, O_RDONLY);
 	if (IS_ERR(filp))
-		kfree(ev_file);
+		goto err_put_refs;
+
+	mutex_lock(&uverbs_file->device->lists_mutex);
+	list_add_tail(&ev_file->list,
+		      &uverbs_file->device->uverbs_events_file_list);
+	mutex_unlock(&uverbs_file->device->lists_mutex);
+
+	if (is_async) {
+		WARN_ON(uverbs_file->async_file);
+		uverbs_file->async_file = ev_file;
+		kref_get(&uverbs_file->async_file->ref);
+		INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
+				      ib_dev,
+				      ib_uverbs_event_handler);
+		ret = ib_register_event_handler(&uverbs_file->event_handler);
+		if (ret)
+			goto err_put_file;
+
+		/* At this point the async file is fully set up */
+		ev_file->is_async = 1;
+	}
 
 	return filp;
+
+err_put_file:
+	fput(filp);
+	kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file);
+	uverbs_file->async_file = NULL;
+	return ERR_PTR(ret);
+
+err_put_refs:
+	kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
+	kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+	return filp;
 }
 
 /*
@@ -611,8 +675,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 			     size_t count, loff_t *pos)
 {
 	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_device *ib_dev;
 	struct ib_uverbs_cmd_hdr hdr;
 	__u32 flags;
+	int srcu_key;
+	ssize_t ret;
 
 	if (count < sizeof hdr)
 		return -EINVAL;
@@ -620,6 +687,14 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 	if (copy_from_user(&hdr, buf, sizeof hdr))
 		return -EFAULT;
 
+	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+	ib_dev = srcu_dereference(file->device->ib_dev,
+				  &file->device->disassociate_srcu);
+	if (!ib_dev) {
+		ret = -EIO;
+		goto out;
+	}
+
 	flags = (hdr.command &
 		 IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
 
@@ -627,26 +702,36 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 		__u32 command;
 
 		if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
-					   IB_USER_VERBS_CMD_COMMAND_MASK))
-			return -EINVAL;
+					   IB_USER_VERBS_CMD_COMMAND_MASK)) {
+			ret = -EINVAL;
+			goto out;
+		}
 
 		command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
 
 		if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
-		    !uverbs_cmd_table[command])
-			return -EINVAL;
+		    !uverbs_cmd_table[command]) {
+			ret = -EINVAL;
+			goto out;
+		}
 
 		if (!file->ucontext &&
-		    command != IB_USER_VERBS_CMD_GET_CONTEXT)
-			return -EINVAL;
+		    command != IB_USER_VERBS_CMD_GET_CONTEXT) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-		if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
-			return -ENOSYS;
+		if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) {
+			ret = -ENOSYS;
+			goto out;
+		}
 
-		if (hdr.in_words * 4 != count)
-			return -EINVAL;
+		if (hdr.in_words * 4 != count) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-		return uverbs_cmd_table[command](file,
+		ret = uverbs_cmd_table[command](file, ib_dev,
 						 buf + sizeof(hdr),
 						 hdr.in_words * 4,
 						 hdr.out_words * 4);
@@ -657,51 +742,72 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 		struct ib_uverbs_ex_cmd_hdr ex_hdr;
 		struct ib_udata ucore;
 		struct ib_udata uhw;
-		int err;
 		size_t written_count = count;
 
 		if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
-					   IB_USER_VERBS_CMD_COMMAND_MASK))
-			return -EINVAL;
+					   IB_USER_VERBS_CMD_COMMAND_MASK)) {
+			ret = -EINVAL;
+			goto out;
+		}
 
 		command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
 
 		if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
-		    !uverbs_ex_cmd_table[command])
-			return -ENOSYS;
+		    !uverbs_ex_cmd_table[command]) {
+			ret = -ENOSYS;
+			goto out;
+		}
 
-		if (!file->ucontext)
-			return -EINVAL;
+		if (!file->ucontext) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-		if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
-			return -ENOSYS;
+		if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) {
+			ret = -ENOSYS;
+			goto out;
+		}
 
-		if (count < (sizeof(hdr) + sizeof(ex_hdr)))
-			return -EINVAL;
+		if (count < (sizeof(hdr) + sizeof(ex_hdr))) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-		if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
-			return -EFAULT;
+		if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) {
+			ret = -EFAULT;
+			goto out;
+		}
 
 		count -= sizeof(hdr) + sizeof(ex_hdr);
 		buf += sizeof(hdr) + sizeof(ex_hdr);
 
-		if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
-			return -EINVAL;
+		if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) {
+			ret = -EINVAL;
+			goto out;
+		}
 
-		if (ex_hdr.cmd_hdr_reserved)
-			return -EINVAL;
+		if (ex_hdr.cmd_hdr_reserved) {
+			ret = -EINVAL;
+			goto out;
+		}
 
 		if (ex_hdr.response) {
-			if (!hdr.out_words && !ex_hdr.provider_out_words)
-				return -EINVAL;
+			if (!hdr.out_words && !ex_hdr.provider_out_words) {
+				ret = -EINVAL;
+				goto out;
+			}
 
 			if (!access_ok(VERIFY_WRITE,
 				       (void __user *) (unsigned long) ex_hdr.response,
-				       (hdr.out_words + ex_hdr.provider_out_words) * 8))
-				return -EFAULT;
+				       (hdr.out_words + ex_hdr.provider_out_words) * 8)) {
+				ret = -EFAULT;
+				goto out;
+			}
 		} else {
-			if (hdr.out_words || ex_hdr.provider_out_words)
-				return -EINVAL;
+			if (hdr.out_words || ex_hdr.provider_out_words) {
+				ret = -EINVAL;
+				goto out;
+			}
 		}
 
 		INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
@@ -713,27 +819,43 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 				       ex_hdr.provider_in_words * 8,
 				       ex_hdr.provider_out_words * 8);
 
-		err = uverbs_ex_cmd_table[command](file,
+		ret = uverbs_ex_cmd_table[command](file,
+						   ib_dev,
 						   &ucore,
 						   &uhw);
-
-		if (err)
-			return err;
-
-		return written_count;
+		if (!ret)
+			ret = written_count;
+	} else {
+		ret = -ENOSYS;
 	}
 
-	return -ENOSYS;
+out:
+	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+	return ret;
 }
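
The per-command capability test above reads as a single bit check: each command a driver supports sets one bit in uverbs_cmd_mask (or uverbs_ex_cmd_mask for extended commands). A hedged helper sketch, not part of the patch:

	/* Illustrative only: a command is permitted iff its bit is set in
	 * the device's command mask.
	 */
	static bool uverbs_cmd_supported(const struct ib_device *ib_dev,
					 __u32 command)
	{
		return (ib_dev->uverbs_cmd_mask & (1ull << command)) != 0;
	}
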
 
 static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_device *ib_dev;
+	int ret = 0;
+	int srcu_key;
+
+	srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+	ib_dev = srcu_dereference(file->device->ib_dev,
+				  &file->device->disassociate_srcu);
+	if (!ib_dev) {
+		ret = -EIO;
+		goto out;
+	}
 
 	if (!file->ucontext)
-		return -ENODEV;
+		ret = -ENODEV;
 	else
-		return file->device->ib_dev->mmap(file->ucontext, vma);
+		ret = ib_dev->mmap(file->ucontext, vma);
+out:
+	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+	return ret;
 }
 
 /*
@@ -750,21 +872,43 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 {
 	struct ib_uverbs_device *dev;
 	struct ib_uverbs_file *file;
+	struct ib_device *ib_dev;
 	int ret;
+	int module_dependent;
+	int srcu_key;
 
 	dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
 	if (!atomic_inc_not_zero(&dev->refcount))
 		return -ENXIO;
 
-	if (!try_module_get(dev->ib_dev->owner)) {
-		ret = -ENODEV;
+	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+	mutex_lock(&dev->lists_mutex);
+	ib_dev = srcu_dereference(dev->ib_dev,
+				  &dev->disassociate_srcu);
+	if (!ib_dev) {
+		ret = -EIO;
 		goto err;
 	}
 
-	file = kmalloc(sizeof *file, GFP_KERNEL);
+	/* If the IB device supports disassociating the ucontext, there is no
+	 * hard dependency between the uverbs device and its low-level device.
+	 */
+	module_dependent = !(ib_dev->disassociate_ucontext);
+
+	if (module_dependent) {
+		if (!try_module_get(ib_dev->owner)) {
+			ret = -ENODEV;
+			goto err;
+		}
+	}
+
+	file = kzalloc(sizeof(*file), GFP_KERNEL);
 	if (!file) {
 		ret = -ENOMEM;
-		goto err_module;
+		if (module_dependent)
+			goto err_module;
+
+		goto err;
 	}
 
 	file->device	 = dev;
@@ -775,13 +919,18 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = file;
 	kobject_get(&dev->kobj);
+	list_add_tail(&file->list, &dev->uverbs_file_list);
+	mutex_unlock(&dev->lists_mutex);
+	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
 	return nonseekable_open(inode, filp);
 
 err_module:
-	module_put(dev->ib_dev->owner);
+	module_put(ib_dev->owner);
 
 err:
+	mutex_unlock(&dev->lists_mutex);
+	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 	if (atomic_dec_and_test(&dev->refcount))
 		ib_uverbs_comp_dev(dev);
 
@@ -792,8 +941,18 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
 {
 	struct ib_uverbs_file *file = filp->private_data;
 	struct ib_uverbs_device *dev = file->device;
-
-	ib_uverbs_cleanup_ucontext(file, file->ucontext);
+	struct ib_ucontext *ucontext = NULL;
+
+	mutex_lock(&file->device->lists_mutex);
+	ucontext = file->ucontext;
+	file->ucontext = NULL;
+	if (!file->is_closed) {
+		list_del(&file->list);
+		file->is_closed = 1;
+	}
+	mutex_unlock(&file->device->lists_mutex);
+	if (ucontext)
+		ib_uverbs_cleanup_ucontext(file, ucontext);
 
 	if (file->async_file)
 		kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
@@ -830,12 +989,21 @@ static struct ib_client uverbs_client = {
 static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
 			  char *buf)
 {
+	int ret = -ENODEV;
+	int srcu_key;
 	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+	struct ib_device *ib_dev;
 
 	if (!dev)
 		return -ENODEV;
 
-	return sprintf(buf, "%s\n", dev->ib_dev->name);
+	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+	if (ib_dev)
+		ret = sprintf(buf, "%s\n", ib_dev->name);
+	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+	return ret;
 }
 static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
 
@@ -843,11 +1011,19 @@ static ssize_t show_dev_abi_version(struct device *device,
 				    struct device_attribute *attr, char *buf)
 {
 	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+	int ret = -ENODEV;
+	int srcu_key;
+	struct ib_device *ib_dev;
 
 	if (!dev)
 		return -ENODEV;
+	srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+	if (ib_dev)
+		ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
-	return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+	return ret;
 }
 static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
 
@@ -887,6 +1063,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	int devnum;
 	dev_t base;
 	struct ib_uverbs_device *uverbs_dev;
+	int ret;
 
 	if (!device->alloc_ucontext)
 		return;
@@ -895,11 +1072,20 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	if (!uverbs_dev)
 		return;
 
+	ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
+	if (ret) {
+		kfree(uverbs_dev);
+		return;
+	}
+
 	atomic_set(&uverbs_dev->refcount, 1);
 	init_completion(&uverbs_dev->comp);
 	uverbs_dev->xrcd_tree = RB_ROOT;
 	mutex_init(&uverbs_dev->xrcd_tree_mutex);
 	kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
+	mutex_init(&uverbs_dev->lists_mutex);
+	INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
+	INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
 
 	spin_lock(&map_lock);
 	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -920,7 +1106,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	}
 	spin_unlock(&map_lock);
 
-	uverbs_dev->ib_dev           = device;
+	rcu_assign_pointer(uverbs_dev->ib_dev, device);
 	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
 
 	cdev_init(&uverbs_dev->cdev, NULL);
@@ -964,9 +1150,72 @@ err:
 	return;
 }
 
-static void ib_uverbs_remove_one(struct ib_device *device)
+static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
+					struct ib_device *ib_dev)
 {
-	struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+	struct ib_uverbs_file *file;
+	struct ib_uverbs_event_file *event_file;
+	struct ib_event event;
+
+	/* Wait for in-flight commands to terminate */
+	synchronize_srcu(&uverbs_dev->disassociate_srcu);
+	event.event = IB_EVENT_DEVICE_FATAL;
+	event.element.port_num = 0;
+	event.device = ib_dev;
+
+	mutex_lock(&uverbs_dev->lists_mutex);
+	while (!list_empty(&uverbs_dev->uverbs_file_list)) {
+		struct ib_ucontext *ucontext;
+
+		file = list_first_entry(&uverbs_dev->uverbs_file_list,
+					struct ib_uverbs_file, list);
+		file->is_closed = 1;
+		ucontext = file->ucontext;
+		list_del(&file->list);
+		file->ucontext = NULL;
+		kref_get(&file->ref);
+		mutex_unlock(&uverbs_dev->lists_mutex);
+		/* We must release the mutex before going ahead and calling
+		 * disassociate_ucontext. disassociate_ucontext might end up
+		 * indirectly calling uverbs_close, for example due to freeing
+		 * the resources (e.g. mmput).
+		 */
+		ib_uverbs_event_handler(&file->event_handler, &event);
+		if (ucontext) {
+			ib_dev->disassociate_ucontext(ucontext);
+			ib_uverbs_cleanup_ucontext(file, ucontext);
+		}
+
+		mutex_lock(&uverbs_dev->lists_mutex);
+		kref_put(&file->ref, ib_uverbs_release_file);
+	}
+
+	while (!list_empty(&uverbs_dev->uverbs_events_file_list)) {
+		event_file = list_first_entry(&uverbs_dev->
+					      uverbs_events_file_list,
+					      struct ib_uverbs_event_file,
+					      list);
+		spin_lock_irq(&event_file->lock);
+		event_file->is_closed = 1;
+		spin_unlock_irq(&event_file->lock);
+
+		list_del(&event_file->list);
+		if (event_file->is_async) {
+			ib_unregister_event_handler(&event_file->uverbs_file->
+						    event_handler);
+			event_file->uverbs_file->event_handler.device = NULL;
+		}
+
+		wake_up_interruptible(&event_file->poll_wait);
+		kill_fasync(&event_file->async_queue, SIGIO, POLL_IN);
+	}
+	mutex_unlock(&uverbs_dev->lists_mutex);
+}
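
The drain loops above follow a common kernel shape: hold lists_mutex only while manipulating the list, pin the entry with a reference, and drop the mutex around work that may sleep or re-enter close. A condensed sketch of that shape (illustrative, not a replacement for the code above):

	mutex_lock(&dev->lists_mutex);
	while (!list_empty(&dev->uverbs_file_list)) {
		file = list_first_entry(&dev->uverbs_file_list,
					struct ib_uverbs_file, list);
		kref_get(&file->ref);	/* pin the entry across the unlock */
		list_del(&file->list);
		mutex_unlock(&dev->lists_mutex);
		/* sleepable work: deliver events, disassociate, clean up */
		mutex_lock(&dev->lists_mutex);
		kref_put(&file->ref, ib_uverbs_release_file);
	}
	mutex_unlock(&dev->lists_mutex);
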
+
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
+{
+	struct ib_uverbs_device *uverbs_dev = client_data;
+	int wait_clients = 1;
 
 	if (!uverbs_dev)
 		return;
@@ -980,9 +1229,27 @@ static void ib_uverbs_remove_one(struct ib_device *device)
 	else
 		clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
 
+	if (device->disassociate_ucontext) {
+		/* We disassociate HW resources and immediately return.
+		 * Userspace will see an EIO errno for all future access.
+		 * Upon returning, the ib_device may be freed internally and is
+		 * no longer valid.
+		 * uverbs_device is still available until all clients close
+		 * their files, then the uverbs device ref count will be zero
+		 * and its resources will be freed.
+		 * Note: At this point no more files can be opened since the
+		 * cdev was deleted; however, active clients can still issue
+		 * commands and close their open files.
+		 */
+		rcu_assign_pointer(uverbs_dev->ib_dev, NULL);
+		ib_uverbs_free_hw_resources(uverbs_dev, device);
+		wait_clients = 0;
+	}
+
 	if (atomic_dec_and_test(&uverbs_dev->refcount))
 		ib_uverbs_comp_dev(uverbs_dev);
-	wait_for_completion(&uverbs_dev->comp);
+	if (wait_clients)
+		wait_for_completion(&uverbs_dev->comp);
 	kobject_put(&uverbs_dev->kobj);
 }
 
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index bac3fb406..e1f2c9887 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -213,28 +213,79 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
 
 /* Protection domains */
 
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ *
+ * Every PD has a local_dma_lkey which can be used as the lkey value for local
+ * memory operations.
+ */
 struct ib_pd *ib_alloc_pd(struct ib_device *device)
 {
 	struct ib_pd *pd;
+	struct ib_device_attr devattr;
+	int rc;
+
+	rc = ib_query_device(device, &devattr);
+	if (rc)
+		return ERR_PTR(rc);
 
 	pd = device->alloc_pd(device, NULL, NULL);
+	if (IS_ERR(pd))
+		return pd;
+
+	pd->device = device;
+	pd->uobject = NULL;
+	pd->local_mr = NULL;
+	atomic_set(&pd->usecnt, 0);
+
+	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+		pd->local_dma_lkey = device->local_dma_lkey;
+	else {
+		struct ib_mr *mr;
+
+		mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(mr)) {
+			ib_dealloc_pd(pd);
+			return (struct ib_pd *)mr;
+		}
 
-	if (!IS_ERR(pd)) {
-		pd->device  = device;
-		pd->uobject = NULL;
-		atomic_set(&pd->usecnt, 0);
+		pd->local_mr = mr;
+		pd->local_dma_lkey = pd->local_mr->lkey;
 	}
-
 	return pd;
 }
 EXPORT_SYMBOL(ib_alloc_pd);
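
A hedged usage sketch of the new guarantee: whether local_dma_lkey comes from the device capability or from the fallback DMA MR, consumers can now fill local SGEs straight from the PD. dma_addr and len are placeholders, not values from this patch:

	struct ib_pd *pd = ib_alloc_pd(device);
	struct ib_sge sge;

	if (IS_ERR(pd))
		return PTR_ERR(pd);
	sge.addr   = dma_addr;			/* placeholder DMA address */
	sge.length = len;			/* placeholder length */
	sge.lkey   = pd->local_dma_lkey;	/* valid on every PD now */
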
 
-int ib_dealloc_pd(struct ib_pd *pd)
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ *
+ * It is an error to call this function while any resources in the pd still
+ * exist.  The caller is responsible for synchronously destroying them and
+ * guaranteeing that no new allocations will happen.
+ */
+void ib_dealloc_pd(struct ib_pd *pd)
 {
-	if (atomic_read(&pd->usecnt))
-		return -EBUSY;
+	int ret;
+
+	if (pd->local_mr) {
+		ret = ib_dereg_mr(pd->local_mr);
+		WARN_ON(ret);
+		pd->local_mr = NULL;
+	}
+
+	/* uverbs manipulates usecnt with proper locking, while the kabi
+	 * requires the caller to guarantee we can't race here. */
+	WARN_ON(atomic_read(&pd->usecnt));
 
-	return pd->device->dealloc_pd(pd);
+	/* Making dealloc_pd a void return is a WIP; no driver should return
+	 * an error here. */
+	ret = pd->device->dealloc_pd(pd);
+	WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
 }
 EXPORT_SYMBOL(ib_dealloc_pd);
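
Caller-side consequence, sketched under the assumption that all PD resources were already destroyed: deallocation error checking disappears, since failures now only WARN inside ib_dealloc_pd():

	WARN_ON(ib_dereg_mr(mr));	/* destroy the PD's users first */
	ib_dealloc_pd(pd);		/* void return; nothing to check */
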
 
@@ -1144,73 +1195,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
 }
 EXPORT_SYMBOL(ib_get_dma_mr);
 
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
-			     struct ib_phys_buf *phys_buf_array,
-			     int num_phys_buf,
-			     int mr_access_flags,
-			     u64 *iova_start)
-{
-	struct ib_mr *mr;
-	int err;
-
-	err = ib_check_mr_access(mr_access_flags);
-	if (err)
-		return ERR_PTR(err);
-
-	if (!pd->device->reg_phys_mr)
-		return ERR_PTR(-ENOSYS);
-
-	mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
-				     mr_access_flags, iova_start);
-
-	if (!IS_ERR(mr)) {
-		mr->device  = pd->device;
-		mr->pd      = pd;
-		mr->uobject = NULL;
-		atomic_inc(&pd->usecnt);
-		atomic_set(&mr->usecnt, 0);
-	}
-
-	return mr;
-}
-EXPORT_SYMBOL(ib_reg_phys_mr);
-
-int ib_rereg_phys_mr(struct ib_mr *mr,
-		     int mr_rereg_mask,
-		     struct ib_pd *pd,
-		     struct ib_phys_buf *phys_buf_array,
-		     int num_phys_buf,
-		     int mr_access_flags,
-		     u64 *iova_start)
-{
-	struct ib_pd *old_pd;
-	int ret;
-
-	ret = ib_check_mr_access(mr_access_flags);
-	if (ret)
-		return ret;
-
-	if (!mr->device->rereg_phys_mr)
-		return -ENOSYS;
-
-	if (atomic_read(&mr->usecnt))
-		return -EBUSY;
-
-	old_pd = mr->pd;
-
-	ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
-					phys_buf_array, num_phys_buf,
-					mr_access_flags, iova_start);
-
-	if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
-		atomic_dec(&old_pd->usecnt);
-		atomic_inc(&pd->usecnt);
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL(ib_rereg_phys_mr);
-
 int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
 {
 	return mr->device->query_mr ?
@@ -1235,54 +1219,28 @@ int ib_dereg_mr(struct ib_mr *mr)
 }
 EXPORT_SYMBOL(ib_dereg_mr);
 
-struct ib_mr *ib_create_mr(struct ib_pd *pd,
-			   struct ib_mr_init_attr *mr_init_attr)
-{
-	struct ib_mr *mr;
-
-	if (!pd->device->create_mr)
-		return ERR_PTR(-ENOSYS);
-
-	mr = pd->device->create_mr(pd, mr_init_attr);
-
-	if (!IS_ERR(mr)) {
-		mr->device  = pd->device;
-		mr->pd      = pd;
-		mr->uobject = NULL;
-		atomic_inc(&pd->usecnt);
-		atomic_set(&mr->usecnt, 0);
-	}
-
-	return mr;
-}
-EXPORT_SYMBOL(ib_create_mr);
-
-int ib_destroy_mr(struct ib_mr *mr)
-{
-	struct ib_pd *pd;
-	int ret;
-
-	if (atomic_read(&mr->usecnt))
-		return -EBUSY;
-
-	pd = mr->pd;
-	ret = mr->device->destroy_mr(mr);
-	if (!ret)
-		atomic_dec(&pd->usecnt);
-
-	return ret;
-}
-EXPORT_SYMBOL(ib_destroy_mr);
-
-struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+/**
+ * ib_alloc_mr() - Allocates a memory region
+ * @pd:            protection domain associated with the region
+ * @mr_type:       memory region type
+ * @max_num_sg:    maximum sg entries available for registration.
+ *
+ * Notes:
+ * Memory registration page/sg lists must not exceed max_num_sg.
+ * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
+ * max_num_sg * used_page_size.
+ *
+ */
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+			  enum ib_mr_type mr_type,
+			  u32 max_num_sg)
 {
 	struct ib_mr *mr;
 
-	if (!pd->device->alloc_fast_reg_mr)
+	if (!pd->device->alloc_mr)
 		return ERR_PTR(-ENOSYS);
 
-	mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
-
+	mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
 	if (!IS_ERR(mr)) {
 		mr->device  = pd->device;
 		mr->pd      = pd;
@@ -1293,7 +1251,7 @@ struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
 
 	return mr;
 }
-EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+EXPORT_SYMBOL(ib_alloc_mr);
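
Migration sketch for callers of the removed fast-reg API (assumed usage, not taken from this patch): ib_alloc_fast_reg_mr(pd, n) maps onto the new entry point with mr_type IB_MR_TYPE_MEM_REG:

	/* old: mr = ib_alloc_fast_reg_mr(pd, max_page_list_len); */
	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_page_list_len);
	if (IS_ERR(mr))
		return PTR_ERR(mr);
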
 
 struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
 							  int max_page_list_len)
-- 
cgit v1.2.3-54-g00ecf